From 6a072cb4e2cb11106ae2c949daffb77b878eda06 Mon Sep 17 00:00:00 2001 From: Arnaud Vergnet Date: Wed, 24 Jun 2020 18:22:14 +0200 Subject: [PATCH] Implemented washinsa parser in python --- ...oard_data.json.bak => dashboard_data.json} | 0 ...ebook_data.json.bak => facebook_data.json} | 0 .../{menu_data.json.bak => menu_data.json} | 0 __examples__/washinsa.json | 102 ++++++++ __examples__/washinsa.json.bak | 1 - washinsa/index.php | 179 -------------- washinsa/washinsa_handler.py | 218 ++++++++++++++++++ washinsa/washinsa_update.sh | 2 +- 8 files changed, 321 insertions(+), 181 deletions(-) rename __examples__/{dashboard_data.json.bak => dashboard_data.json} (100%) rename __examples__/{facebook_data.json.bak => facebook_data.json} (100%) rename __examples__/{menu_data.json.bak => menu_data.json} (100%) create mode 100644 __examples__/washinsa.json delete mode 100644 __examples__/washinsa.json.bak delete mode 100644 washinsa/index.php create mode 100644 washinsa/washinsa_handler.py diff --git a/__examples__/dashboard_data.json.bak b/__examples__/dashboard_data.json similarity index 100% rename from __examples__/dashboard_data.json.bak rename to __examples__/dashboard_data.json diff --git a/__examples__/facebook_data.json.bak b/__examples__/facebook_data.json similarity index 100% rename from __examples__/facebook_data.json.bak rename to __examples__/facebook_data.json diff --git a/__examples__/menu_data.json.bak b/__examples__/menu_data.json similarity index 100% rename from __examples__/menu_data.json.bak rename to __examples__/menu_data.json diff --git a/__examples__/washinsa.json b/__examples__/washinsa.json new file mode 100644 index 0000000..5d14ce7 --- /dev/null +++ b/__examples__/washinsa.json @@ -0,0 +1,102 @@ +{ + "dryers": [ + { + "number": "1", + "state": "DISPONIBLE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + }, + { + "number": "2", + "state": "EN COURS", + "startTime": "15:24", + "endTime": "15:59", + "donePercent": 
"62.9", + "remainingTime": "13" + }, + { + "number": "3", + "state": "TERMINE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + } + ], + "washers": [ + { + "number": "4", + "state": "EN COURS", + "startTime": "\u00a0", + "endTime": "\u00a0", + "donePercent": "", + "remainingTime": "0" + }, + { + "number": "5", + "state": "EN COURS", + "startTime": "15:35", + "endTime": "16:15", + "donePercent": "26.9", + "remainingTime": "29" + }, + { + "number": "6", + "state": "DISPONIBLE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + }, + { + "number": "7", + "state": "EN COURS", + "startTime": "15:34", + "endTime": "16:19", + "donePercent": "26", + "remainingTime": "33" + }, + { + "number": "8", + "state": "DISPONIBLE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + }, + { + "number": "9", + "state": "DISPONIBLE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + }, + { + "number": "10", + "state": "DISPONIBLE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + }, + { + "number": "11", + "state": "DISPONIBLE", + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "" + }, + { + "number": "12", + "state": "EN COURS", + "startTime": "15:02", + "endTime": "15:47", + "donePercent": "97.7", + "remainingTime": "1" + } + ] +} \ No newline at end of file diff --git a/__examples__/washinsa.json.bak b/__examples__/washinsa.json.bak deleted file mode 100644 index 3fa8e24..0000000 --- a/__examples__/washinsa.json.bak +++ /dev/null @@ -1 +0,0 @@ -{"dryers":[{"number":"1","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"2","state":"EN COURS","startTime":"15:24","endTime":"15:59","donePercent":"62.9","remainingTime":"13"},{"number":"3","state":"TERMINE","startTime":"","endTime":"","donePercent":"","remainingTime":""}],"washers":[{"number":"4","state":"EN 
COURS","startTime":"\u00a0","endTime":"\u00a0","donePercent":"","remainingTime":"0"},{"number":"5","state":"EN COURS","startTime":"15:35","endTime":"16:15","donePercent":"26.9","remainingTime":"29"},{"number":"6","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"7","state":"EN COURS","startTime":"15:34","endTime":"16:19","donePercent":"26","remainingTime":"33"},{"number":"8","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"9","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"10","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"11","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"12","state":"EN COURS","startTime":"15:02","endTime":"15:47","donePercent":"97.7","remainingTime":"1"}]} \ No newline at end of file diff --git a/washinsa/index.php b/washinsa/index.php deleted file mode 100644 index 9190e39..0000000 --- a/washinsa/index.php +++ /dev/null @@ -1,179 +0,0 @@ -nodeValue); - if (substr($a[0], -10) == "DISPONIBLE") { - return 1; - } elseif (substr($a[0], -7) == "TERMINE") { - return 3; - } elseif (isset($node[4])) { //If there is time information, it is current - return 2; - } else { - return 0; - } -} - -/** Clean strings (removes spaces, tabs end return before and behind the string) - * @param $str (string) - * @return string - */ -function clean($str) -{ - return rtrim(ltrim(htmlspecialchars_decode($str))); -} - - -function generateJson() -{ - $page = new DOMDocument(); - $page->loadHTMLFile("https://www.proxiwash.com/weblaverie/ma-laverie-2?s=cf4f39&16d33a57b3fb9a05d4da88969c71de74=1"); //Get page - - - $division = $page->getElementById("liste-machines"); - $liste_machines = $division->childNodes[0]->childNodes; - -//Remove titles - $titles = $liste_machines->item(0); - $titles->parentNode->removeChild($titles); - - - 
$return = array( - 'dryers' => array(), - 'washers' => array() - ); - - foreach ($liste_machines as $machines) { - - $specs = $machines->childNodes; - - //Format array - $template = array( - 'number' => "", - 'state' => "", - 'startTime' => "", - 'endTime' => "", - 'donePercent' => "", - 'remainingTime' => "" - ); - - //Get info from HTML - $specsArray = array(); - foreach ($specs as $info) { - if (isset($info->tagName)) { - if ($info->tagName == "td") { - array_push($specsArray, $info); - } - } - } - - //Get Type - $type = mType(clean($specsArray[0]->nodeValue)); - - //Get Number - preg_match('/\d{1,2}/', $specsArray[1]->nodeValue, $number); - $template['number'] = $number[0]; - - //Get Status and DonePercent - $status = mStatus($specsArray); - switch ($status) { - case 1: - $template['state'] = "DISPONIBLE"; - break; - case 2: - $template['state'] = "EN COURS"; - foreach ($specsArray[2]->childNodes as $child) { - if (isset($child->tagName)) { - if ($child->tagName == "table") { - $progressBar = $child->childNodes[0]->childNodes[0]; - if ($progressBar->getAttribute('bgcolor') == "Green") { - $template['donePercent'] = substr($progressBar->getAttribute('width'), 0, -1); - } - } - } - } - break; - case 3: - $template['state'] = "TERMINE"; - break; - default : - $template['state'] = "HORS SERVICE"; - break; - } - - if ($template['state'] === "EN COURS") { // We set Times only when they could exist - //Get StartTime - $template['startTime'] = $specsArray[4]->nodeValue; - //Get EndTime - $template['endTime'] = $specsArray[5]->nodeValue; - $template['remainingTime'] = get_remaining_time($template['startTime'], $template['endTime'], $template['donePercent']); - } - - if ($type == 1) { - array_push($return['dryers'], $template); - } elseif ($type == 2) { - array_push($return['washers'], $template); - } - -// echo "
";
-//    echo "Type : ".$type."\n" ;
-//    print_r($template);
-//    echo "
"; - } - $jsonData = json_encode($return); - file_put_contents('washinsa.json', $jsonData); -} - - -/** - * Get remaining time for the current machine - * - * @param $startTime - * @param $endTime - * @return string - */ -function get_remaining_time($startTime, $endTime, $percentDone) -{ - $startArray = explode(':', $startTime); - $endArray = explode(':', $endTime); - - $unixStart = mktime($startArray[0], $startArray[1], 0, 1, 0); - $unixEnd = mktime($endArray[0], $endArray[1], 0, 1, 0); - - if ($unixStart > $unixEnd) { // Machine ends the following day - $unixEnd = mktime($endArray[0], $endArray[1], 0, 1, 1); - } - $deltaMinutes = ($unixEnd - $unixStart)/60; - $remainingTime = $deltaMinutes * (1 - $percentDone/100); - return strval(round($remainingTime, 0)); -} - -generateJson(); diff --git a/washinsa/washinsa_handler.py b/washinsa/washinsa_handler.py new file mode 100644 index 0000000..35e6777 --- /dev/null +++ b/washinsa/washinsa_handler.py @@ -0,0 +1,218 @@ +# Parser made with BeautifulSoup4 +# https://www.crummy.com/software/BeautifulSoup/bs4/doc + +from bs4 import BeautifulSoup +import urllib.request +from enum import Enum, auto +import pprint +import re +import json + +''' +PAGE STRUCTURE +as of june 2020 + +A table with a row (tr html tag) for each machine +Each machine row is composed of 6 columns + - 1 - Type ("SECHE LINGE 14 KG" or "LAVE LINGE 6 KG") + - 2 - Number ("No X" with X the current number) + - 3 - Status (A string showing the status or a table showing the progress) + - 4 - Program (Name of the program or empty) + - 5 - Start time (The start time in format HH:MM or empty) + - 6 - End time (The end time in format HH:MM or empty) +''' + +DUMP_FILE = "washinsa_data.json" +WASHINSA_URL = "https://www.proxiwash.com/weblaverie/component/weblaverie/?view=instancesfiche&format=raw&s=cf4f39" +DRYER_STRING = "SECHE LINGE" + + +class State(Enum): + AVAILABLE = 0 + RUNNING = 1 + RUNNING_NOT_STARTED = 2 + FINISHED = 3 + UNAVAILABLE = 4 + ERROR = 5 + UNKNOWN 
= 6 + + +# Table used to convert state string given by the page into State enum +STATE_CONVERSION_TABLE = { + "DISPONIBLE": State.AVAILABLE, + "TERMINE": State.FINISHED, + "HORS SERVICE": State.UNAVAILABLE, + "ERREUR": State.ERROR, +} + +TIME_RE = re.compile("^\d\d:\d\d$") + + +def download_page(): + """ + Downloads the page from proxiwash website + """ + try: + with urllib.request.urlopen(WASHINSA_URL) as response: + return response.read().decode() + except: + print("Error processing following url: " + WASHINSA_URL) + return "" + + +def get_rows(soup): + """ + Gets rows corresponding to machines on the page + """ + rows = soup.table.contents + del rows[0] + return rows + + +def is_machine_dryer(row): + """ + Checks if the given machine is a dryer. If it is not, it means it is a washer. + + To check this, we check whether the test string is included in the type string + """ + return DRYER_STRING in row.contents[0].text + + +def get_machine_number(row): + """ + Gets the current machine number. + + To find it, we look in the Number cell and remove the "No" prefix to keep only the number. + We do not rely on the index in the list as it could get lost in parsing. + Plus, this method allows for non numeric machine identifiers. + """ + return row.contents[1].text.split()[1] + + +def get_machine_state(row): + """ + Gets the current machine state. + + The state is usually written in plain text inside the State cell. + In this case, we simply check it against the state translation table, + to turn it into a State Enum, which is easier to manipulate. + + In some cases, this plain text is replaced by a progress bar. + This means the machine is running. + + If the state string cannot be recognized, this returns the unknown state. 
+ """ + content = row.contents[2].contents[0] + state = State.UNKNOWN + if content.name == "table": + state = State.RUNNING + else: + for key in STATE_CONVERSION_TABLE: + if key in content.text: + state = STATE_CONVERSION_TABLE[key] + break + return state + + +def get_machine_program(row): + """ + Gets the machine program as written in plain text in the Program cell + """ + return row.contents[3].text.strip() + + +def get_machine_times(row): + """ + Gets the start and end time for the machine. + + If one of these times is invalid (does not respect the HH:MM format), both are returned as empty strings + """ + start_time = row.contents[4].text + end_time = row.contents[5].text + if TIME_RE.match(start_time) and TIME_RE.match(end_time): + return start_time, end_time + else: + return "", "" + + +def get_machine_done_percent(row): + """ + Gets the machine done percentage. + + This percent is given in the width property of the first column, in the table inside the State cell. + This only applies if the machine is running. + """ + content = row.contents[2].contents[0] + percent = "" + if content.name == "table": + percent = content.td["width"].replace("%", "") + if float(percent) > 100: # because the website is not doing this check... + percent = '100' + return percent + + +def get_machine_remaining_time(row): + """ + Gets the remaining time in minutes. + + The time is written in the title property of the table inside the State cell. + It is written inside a phrase, so we need to extract it. + This only applies if the machine is running. 
+ """ + content = row.contents[2].contents[0] + time = 0 + if content.name == "table": + time = content["title"].split("=")[1].split()[0] + return time + + +def get_parsed_data(rows): + """ + Gets the parsed data from the web page, formatting it in an easy to use object + """ + dryers = [] + washers = [] + for row in rows: + state = get_machine_state(row) + machine = { + "number": get_machine_number(row), + "state": state.value, + "startTime": "", + "endTime": "", + "donePercent": "", + "remainingTime": "", + "program": "", + } + if state == State.RUNNING: + machine_times = get_machine_times(row) + machine["startTime"] = machine_times[0] + machine["endTime"] = machine_times[1] + if len(machine_times[0]) == 0: + state = State.RUNNING_NOT_STARTED + machine["state"] = state.value + machine["program"] = get_machine_program(row) + machine["donePercent"] = get_machine_done_percent(row) + machine["remainingTime"] = get_machine_remaining_time(row) + + if is_machine_dryer(row): + dryers.append(machine) + else: + washers.append(machine) + + return { + "dryers": dryers, + "washers": washers + } + + +def main(): + soup = BeautifulSoup(download_page(), 'html.parser') + rows = get_rows(soup) + with open(DUMP_FILE, 'w') as f: + json.dump(get_parsed_data(rows), f) + + +main() + + + diff --git a/washinsa/washinsa_update.sh b/washinsa/washinsa_update.sh index 810a7aa..2f213f2 100755 --- a/washinsa/washinsa_update.sh +++ b/washinsa/washinsa_update.sh @@ -1,5 +1,5 @@ #!/bin/bash touch lock -php index.php +python3 washinsa_handler.py > log 2> err rm lock