Implemented washinsa parser in python

This commit is contained in:
Arnaud Vergnet 2020-06-24 18:22:14 +02:00
parent 4cbc13dd01
commit 6a072cb4e2
8 changed files with 321 additions and 181 deletions

102
__examples__/washinsa.json Normal file
View file

@ -0,0 +1,102 @@
{
"dryers": [
{
"number": "1",
"state": "DISPONIBLE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
},
{
"number": "2",
"state": "EN COURS",
"startTime": "15:24",
"endTime": "15:59",
"donePercent": "62.9",
"remainingTime": "13"
},
{
"number": "3",
"state": "TERMINE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
}
],
"washers": [
{
"number": "4",
"state": "EN COURS",
"startTime": "\u00a0",
"endTime": "\u00a0",
"donePercent": "",
"remainingTime": "0"
},
{
"number": "5",
"state": "EN COURS",
"startTime": "15:35",
"endTime": "16:15",
"donePercent": "26.9",
"remainingTime": "29"
},
{
"number": "6",
"state": "DISPONIBLE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
},
{
"number": "7",
"state": "EN COURS",
"startTime": "15:34",
"endTime": "16:19",
"donePercent": "26",
"remainingTime": "33"
},
{
"number": "8",
"state": "DISPONIBLE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
},
{
"number": "9",
"state": "DISPONIBLE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
},
{
"number": "10",
"state": "DISPONIBLE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
},
{
"number": "11",
"state": "DISPONIBLE",
"startTime": "",
"endTime": "",
"donePercent": "",
"remainingTime": ""
},
{
"number": "12",
"state": "EN COURS",
"startTime": "15:02",
"endTime": "15:47",
"donePercent": "97.7",
"remainingTime": "1"
}
]
}

View file

@ -1 +0,0 @@
{"dryers":[{"number":"1","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"2","state":"EN COURS","startTime":"15:24","endTime":"15:59","donePercent":"62.9","remainingTime":"13"},{"number":"3","state":"TERMINE","startTime":"","endTime":"","donePercent":"","remainingTime":""}],"washers":[{"number":"4","state":"EN COURS","startTime":"\u00a0","endTime":"\u00a0","donePercent":"","remainingTime":"0"},{"number":"5","state":"EN COURS","startTime":"15:35","endTime":"16:15","donePercent":"26.9","remainingTime":"29"},{"number":"6","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"7","state":"EN COURS","startTime":"15:34","endTime":"16:19","donePercent":"26","remainingTime":"33"},{"number":"8","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"9","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"10","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"11","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"12","state":"EN COURS","startTime":"15:02","endTime":"15:47","donePercent":"97.7","remainingTime":"1"}]}

View file

@ -1,179 +0,0 @@
<?php
/**
* Wash-INSA PARSER
* This program gets all the information about the laundry
* Author : Gabin NOBLET (Promo 55)
* Date : 07/2019
* Version 1.0
*/
/**
 * Get the type of a machine from its label.
 * Only the first space-separated word of the label is inspected.
 * @param $str (string) machine label
 * @return int 1 for dryers, 2 for washers, 0 otherwise
 */
function mType($str)
{
    $firstWord = explode(' ', $str)[0];
    if (substr($firstWord, -5) == "SECHE") {
        return 1;
    }
    if (substr($firstWord, -4) == "LAVE") {
        return 2;
    }
    return 0;
}
/**
 * Get the status of a machine from the cells of its row.
 * The first space-separated word of the third cell is compared with the known labels.
 * @param $node (array of DOMNode) cells of the machine row
 * @return int 1 for available, 2 for current, 3 for finished, 0 for out of order
 */
function mStatus($node)
{
    $firstWord = explode(' ', $node[2]->nodeValue)[0];
    if (substr($firstWord, -10) == "DISPONIBLE") {
        return 1;
    }
    if (substr($firstWord, -7) == "TERMINE") {
        return 3;
    }
    // If there is time information (a fifth cell), the machine is current
    return isset($node[4]) ? 2 : 0;
}
/**
 * Clean a string: decode the special HTML entities, then remove the
 * whitespace (spaces, tabs and line breaks) before and after it.
 * @param $str (string)
 * @return string
 */
function clean($str)
{
    $decoded = htmlspecialchars_decode($str);
    return trim($decoded);
}
/**
 * Download the laundry page, parse every machine row and write the result
 * to the file 'washinsa.json'.
 *
 * The written JSON has two lists, 'dryers' and 'washers'. Each machine is an
 * array with the keys number, state, startTime, endTime, donePercent and
 * remainingTime (all strings). Rows whose type is neither dryer nor washer
 * are not written.
 */
function generateJson()
{
$page = new DOMDocument();
$page->loadHTMLFile("https://www.proxiwash.com/weblaverie/ma-laverie-2?s=cf4f39&16d33a57b3fb9a05d4da88969c71de74=1"); //Get page
$division = $page->getElementById("liste-machines");
// The machine rows are the children of the first child of the "liste-machines" element
$liste_machines = $division->childNodes[0]->childNodes;
//Remove titles
$titles = $liste_machines->item(0);
$titles->parentNode->removeChild($titles);
$return = array(
'dryers' => array(),
'washers' => array()
);
foreach ($liste_machines as $machines) {
$specs = $machines->childNodes;
//Format array
$template = array(
'number' => "",
'state' => "",
'startTime' => "",
'endTime' => "",
'donePercent' => "",
'remainingTime' => ""
);
//Get info from HTML (only the "td" children of the row are kept)
$specsArray = array();
foreach ($specs as $info) {
if (isset($info->tagName)) {
if ($info->tagName == "td") {
array_push($specsArray, $info);
}
}
}
//Get Type (from the first cell)
$type = mType(clean($specsArray[0]->nodeValue));
//Get Number (first run of one or two digits in the second cell)
preg_match('/\d{1,2}/', $specsArray[1]->nodeValue, $number);
$template['number'] = $number[0];
//Get Status and DonePercent
$status = mStatus($specsArray);
switch ($status) {
case 1:
$template['state'] = "DISPONIBLE";
break;
case 2:
$template['state'] = "EN COURS";
// Look for a table inside the status cell and read its first cell
foreach ($specsArray[2]->childNodes as $child) {
if (isset($child->tagName)) {
if ($child->tagName == "table") {
$progressBar = $child->childNodes[0]->childNodes[0];
// The percentage is only read when that cell is the green one
if ($progressBar->getAttribute('bgcolor') == "Green") {
// Drop the last character of the width attribute (presumably the "%" sign)
$template['donePercent'] = substr($progressBar->getAttribute('width'), 0, -1);
}
}
}
}
break;
case 3:
$template['state'] = "TERMINE";
break;
default :
$template['state'] = "HORS SERVICE";
break;
}
if ($template['state'] === "EN COURS") { // We set Times only when they could exist
//Get StartTime (fifth cell)
$template['startTime'] = $specsArray[4]->nodeValue;
//Get EndTime (sixth cell)
$template['endTime'] = $specsArray[5]->nodeValue;
$template['remainingTime'] = get_remaining_time($template['startTime'], $template['endTime'], $template['donePercent']);
}
// Type 1 is a dryer, type 2 a washer; any other type is ignored
if ($type == 1) {
array_push($return['dryers'], $template);
} elseif ($type == 2) {
array_push($return['washers'], $template);
}
// echo "<pre>";
// echo "Type : ".$type."\n" ;
// print_r($template);
// echo "</pre>";
}
$jsonData = json_encode($return);
file_put_contents('washinsa.json', $jsonData);
}
/**
 * Get remaining time for the current machine
 *
 * The cycle duration is the difference between the end and start times
 * (an end time earlier than the start time means the machine ends the
 * following day); the remaining time is the part of that duration that is
 * not done yet.
 *
 * @param $startTime (string) start time, "HH:MM"
 * @param $endTime (string) end time, "HH:MM"
 * @param $percentDone (string) percentage already done, "" when unknown
 * @return string remaining minutes, rounded; "0" when a time is not in "HH:MM" format
 */
function get_remaining_time($startTime, $endTime, $percentDone)
{
    $timePattern = '/^\s*(\d{1,2}):(\d{1,2})/';
    // The page may give something that is not a time (for instance a
    // non-breaking space): do not feed it to arithmetic, report no time left
    if (!preg_match($timePattern, $startTime, $start) || !preg_match($timePattern, $endTime, $end)) {
        return "0";
    }
    $startMinutes = (int)$start[1] * 60 + (int)$start[2];
    $endMinutes = (int)$end[1] * 60 + (int)$end[2];
    $deltaMinutes = $endMinutes - $startMinutes;
    if ($deltaMinutes < 0) { // Machine ends the following day
        $deltaMinutes += 24 * 60;
    }
    // An unknown ("") percentage counts as nothing done yet
    $done = is_numeric($percentDone) ? (float)$percentDone : 0.0;
    $remainingTime = $deltaMinutes * (1 - $done / 100);
    return strval(round($remainingTime, 0));
}
// Entry point: the JSON file is regenerated each time this script is executed
generateJson();

View file

@ -0,0 +1,218 @@
# Parser made with BeautifulSoup4
# https://www.crummy.com/software/BeautifulSoup/bs4/doc
from bs4 import BeautifulSoup
import urllib.request
from enum import Enum, auto
import pprint
import re
import json
'''
PAGE STRUCTURE
as of june 2020
A table with a row (tr html tag) for each machine
Each machine row is composed of 6 columns
- 1 - Type ("SECHE LINGE 14 KG" or "LAVE LINGE 6 KG")
- 2 - Number ("No X" with X the current number)
- 3 - Status (A string showing the status or a table showing the progress)
- 4 - Program (Name of the program or empty)
- 5 - Start time (The start time in format HH:MM or empty)
- 6 - End time (The end time in format HH:MM or empty)
'''
# File in which main() writes the parsed data, as JSON
DUMP_FILE = "washinsa_data.json"
# Page downloaded by download_page()
WASHINSA_URL = "https://www.proxiwash.com/weblaverie/component/weblaverie/?view=instancesfiche&format=raw&s=cf4f39"
# Text searched by is_machine_dryer() in the first cell of a row to recognise a dryer
DRYER_STRING = "SECHE LINGE"
class State(Enum):
    """
    States a machine can be in.
    The integer value of the state is what gets written in the generated data.
    """

    # Page shows "DISPONIBLE"
    AVAILABLE = 0
    # Page shows a progress table instead of a plain text state
    RUNNING = 1
    # Progress table shown, but no valid start and end times
    RUNNING_NOT_STARTED = 2
    # Page shows "TERMINE"
    FINISHED = 3
    # Page shows "HORS SERVICE"
    UNAVAILABLE = 4
    # Page shows "ERREUR"
    ERROR = 5
    # The state string could not be recognized
    UNKNOWN = 6
# Conversion table from the state strings written on the page to the State enum.
# The strings are tested in this order and the first one found in the cell text wins.
STATE_CONVERSION_TABLE = {
    "DISPONIBLE": State.AVAILABLE,
    "TERMINE": State.FINISHED,
    "HORS SERVICE": State.UNAVAILABLE,
    "ERREUR": State.ERROR,
}
# Matches a time in HH:MM format.
# Raw string: "\d" must reach the regex engine as is, it is not a valid string escape.
TIME_RE = re.compile(r"^\d\d:\d\d$")
def download_page():
    """
    Downloads the page from proxiwash website.

    Returns the decoded content of the page, or an empty string if the page
    could not be downloaded or decoded (a message is printed in this case).
    """
    # Local import: only needed to name the HTTP protocol errors below
    import http.client

    try:
        with urllib.request.urlopen(WASHINSA_URL) as response:
            return response.read().decode()
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers network and URL errors, ValueError covers decoding errors.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit go through.
        print("Error processing following url: " + WASHINSA_URL)
        return ""
def get_rows(soup):
    """
    Gets rows corresponding to machines on the page.

    The first child of the table is skipped (presumably the titles row).
    A new list is returned: the content of the parsed table is not modified,
    and a table without any child simply gives an empty list.
    """
    return soup.table.contents[1:]
def is_machine_dryer(row):
    """
    Tells if the machine of the given row is a dryer. Any other machine is considered a washer.
    A machine is a dryer when the dryer test string is found in the text of the first cell of its row.
    """
    type_cell = row.contents[0]
    return DRYER_STRING in type_cell.text
def get_machine_number(row):
    """
    Gets the number of the machine of the given row.
    The Number cell holds a prefix followed by the identifier ("No X"):
    the second word of the cell is returned, as a string.
    The position of the row in the list is not used as it could get lost in parsing,
    and this way the identifier does not have to be numeric.
    """
    number_cell = row.contents[1]
    words = number_cell.text.split()
    return words[1]
def get_machine_state(row):
    """
    Gets the state of the machine of the given row, as a State enum.
    The first element of the State cell is either a table or some plain text.
    A table is the progress bar: this means the machine is running.
    A plain text is checked against the state conversion table,
    and the state of the first string found in the text is returned.
    If no string of the table is found, the unknown state is returned.
    """
    content = row.contents[2].contents[0]
    if content.name == "table":
        return State.RUNNING
    return next(
        (state for label, state in STATE_CONVERSION_TABLE.items() if label in content.text),
        State.UNKNOWN,
    )
def get_machine_program(row):
    """
    Gets the program of the machine: the text of the Program cell, without surrounding whitespace
    """
    program_cell = row.contents[3]
    return program_cell.text.strip()
def get_machine_times(row):
    """
    Gets the start and end times of the machine, as a (start, end) tuple.
    Both times must respect the HH:MM format: if one of them does not,
    two empty strings are returned.
    """
    cells = row.contents
    times = cells[4].text, cells[5].text
    if not all(TIME_RE.match(value) for value in times):
        return "", ""
    return times
def get_machine_done_percent(row):
    """
    Gets the machine done percentage, as a string without the percent sign.
    This percent is given in the width property of the first column, in the table inside the State cell.
    This only applies if the machine is running: an empty string is returned otherwise.
    An empty string is also returned when the width is missing or is not a number,
    so that one unexpected row does not stop the whole parsing.
    """
    content = row.contents[2].contents[0]
    if content.name != "table":
        return ""
    try:
        percent = content.td["width"].replace("%", "").strip()
        value = float(percent)
    except (KeyError, TypeError, ValueError):
        # No column, no width property, or a width that is not a number
        return ""
    if value > 100:  # because the website is not doing this check...
        return '100'
    return percent
def get_machine_remaining_time(row):
    """
    Gets the remaining time in minutes.
    The time is found in the title property of the table inside the State cell:
    it is the first word written after the "=" sign of the title, returned as a string.
    This only applies if the machine is running: the number 0 is returned otherwise.
    """
    content = row.contents[2].contents[0]
    if content.name != "table":
        return 0
    after_equal_sign = content["title"].split("=")[1]
    return after_equal_sign.split()[0]
def get_parsed_data(rows):
    """
    Gets the parsed data from the web page, formatting it in an easy to use object:
    a dictionary with the list of dryers and the list of washers.
    Each machine is a dictionary, whose state is the value of the State enum.
    Times, program, done percent and remaining time are only filled for running machines.
    """
    parsed = {
        "dryers": [],
        "washers": []
    }
    for row in rows:
        state = get_machine_state(row)
        machine = {"number": get_machine_number(row), "state": state.value}
        # Fields which stay empty unless the machine is running
        for field in ("startTime", "endTime", "donePercent", "remainingTime", "program"):
            machine[field] = ""

        if state == State.RUNNING:
            start_time, end_time = get_machine_times(row)
            machine["startTime"] = start_time
            machine["endTime"] = end_time
            if not start_time:
                # Progress bar shown without valid times
                machine["state"] = State.RUNNING_NOT_STARTED.value
            machine["program"] = get_machine_program(row)
            machine["donePercent"] = get_machine_done_percent(row)
            machine["remainingTime"] = get_machine_remaining_time(row)

        category = "dryers" if is_machine_dryer(row) else "washers"
        parsed[category].append(machine)
    return parsed
def main():
    """
    Downloads the page, parses it and writes the parsed data in the dump file, as JSON.
    """
    soup = BeautifulSoup(download_page(), 'html.parser')
    # Parse everything before opening the dump file: opening it erases its content,
    # so an error raised while parsing must not leave an empty file behind
    data = get_parsed_data(get_rows(soup))
    with open(DUMP_FILE, 'w') as f:
        json.dump(data, f)


# Only run when executed as a script, so the functions can be imported without side effects
if __name__ == "__main__":
    main()

View file

@ -1,5 +1,5 @@
#!/bin/bash #!/bin/bash
touch lock touch lock
php index.php python3 washinsa_handler.py > log 2> err
rm lock rm lock