Implemented washinsa parser in python
This commit is contained in:
		
							parent
							
								
									4cbc13dd01
								
							
						
					
					
						commit
						6a072cb4e2
					
				
					 8 changed files with 321 additions and 181 deletions
				
			
		
							
								
								
									
										102
									
								
								__examples__/washinsa.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										102
									
								
								__examples__/washinsa.json
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,102 @@ | |||
| { | ||||
|   "dryers": [ | ||||
|     { | ||||
|       "number": "1", | ||||
|       "state": "DISPONIBLE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     }, | ||||
|     { | ||||
|       "number": "2", | ||||
|       "state": "EN COURS", | ||||
|       "startTime": "15:24", | ||||
|       "endTime": "15:59", | ||||
|       "donePercent": "62.9", | ||||
|       "remainingTime": "13" | ||||
|     }, | ||||
|     { | ||||
|       "number": "3", | ||||
|       "state": "TERMINE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     } | ||||
|   ], | ||||
|   "washers": [ | ||||
|     { | ||||
|       "number": "4", | ||||
|       "state": "EN COURS", | ||||
|       "startTime": "\u00a0", | ||||
|       "endTime": "\u00a0", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "0" | ||||
|     }, | ||||
|     { | ||||
|       "number": "5", | ||||
|       "state": "EN COURS", | ||||
|       "startTime": "15:35", | ||||
|       "endTime": "16:15", | ||||
|       "donePercent": "26.9", | ||||
|       "remainingTime": "29" | ||||
|     }, | ||||
|     { | ||||
|       "number": "6", | ||||
|       "state": "DISPONIBLE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     }, | ||||
|     { | ||||
|       "number": "7", | ||||
|       "state": "EN COURS", | ||||
|       "startTime": "15:34", | ||||
|       "endTime": "16:19", | ||||
|       "donePercent": "26", | ||||
|       "remainingTime": "33" | ||||
|     }, | ||||
|     { | ||||
|       "number": "8", | ||||
|       "state": "DISPONIBLE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     }, | ||||
|     { | ||||
|       "number": "9", | ||||
|       "state": "DISPONIBLE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     }, | ||||
|     { | ||||
|       "number": "10", | ||||
|       "state": "DISPONIBLE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     }, | ||||
|     { | ||||
|       "number": "11", | ||||
|       "state": "DISPONIBLE", | ||||
|       "startTime": "", | ||||
|       "endTime": "", | ||||
|       "donePercent": "", | ||||
|       "remainingTime": "" | ||||
|     }, | ||||
|     { | ||||
|       "number": "12", | ||||
|       "state": "EN COURS", | ||||
|       "startTime": "15:02", | ||||
|       "endTime": "15:47", | ||||
|       "donePercent": "97.7", | ||||
|       "remainingTime": "1" | ||||
|     } | ||||
|   ] | ||||
| } | ||||
|  | @ -1 +0,0 @@ | |||
| {"dryers":[{"number":"1","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"2","state":"EN COURS","startTime":"15:24","endTime":"15:59","donePercent":"62.9","remainingTime":"13"},{"number":"3","state":"TERMINE","startTime":"","endTime":"","donePercent":"","remainingTime":""}],"washers":[{"number":"4","state":"EN COURS","startTime":"\u00a0","endTime":"\u00a0","donePercent":"","remainingTime":"0"},{"number":"5","state":"EN COURS","startTime":"15:35","endTime":"16:15","donePercent":"26.9","remainingTime":"29"},{"number":"6","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"7","state":"EN COURS","startTime":"15:34","endTime":"16:19","donePercent":"26","remainingTime":"33"},{"number":"8","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"9","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"10","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"11","state":"DISPONIBLE","startTime":"","endTime":"","donePercent":"","remainingTime":""},{"number":"12","state":"EN COURS","startTime":"15:02","endTime":"15:47","donePercent":"97.7","remainingTime":"1"}]} | ||||
|  | @ -1,179 +0,0 @@ | |||
| <?php | ||||
| /** | ||||
|  * Wash-INSA PARSER | ||||
|  * This program gets all information about the laundry | ||||
|  * Author : Gabin NOBLET (Promo 55) | ||||
|  * Date : 07/2019 | ||||
|  * Version 1.0 | ||||
|  */ | ||||
| 
 | ||||
| 
 | ||||
/**
 * Classify a machine from the label found in its type cell.
 *
 * Only the first space-separated word of the label is inspected: a word
 * ending in "SECHE" denotes a dryer, a word ending in "LAVE" a washer.
 *
 * @param $str (string) Machine type label
 * @return int 1 for dryers, 2 for washers, 0 otherwise
 */
function mType($str)
{
    $firstWord = explode(' ', $str)[0];
    // Checked in this order: dryer suffix first, then washer suffix.
    $codesBySuffix = array('SECHE' => 1, 'LAVE' => 2);

    foreach ($codesBySuffix as $suffix => $code) {
        if (substr($firstWord, -strlen($suffix)) === $suffix) {
            return $code;
        }
    }

    return 0;
}
| 
 | ||||
/**
 * Work out the status of a machine from the cells of its row.
 *
 * The first word of the third cell's text decides between "available" and
 * "finished". Otherwise the machine counts as running when a fifth cell
 * (time information) exists, and as out of order when it does not.
 *
 * @param $node (array) Cells of one machine row; index 2 is read, index 4 is only tested for existence
 * @return int 1 for available, 2 for current, 3 for finished, 0 for out of order
 */
function mStatus($node)
{
    $firstWord = explode(' ', $node[2]->nodeValue)[0];

    if (substr($firstWord, -10) === "DISPONIBLE") {
        return 1;
    }
    if (substr($firstWord, -7) === "TERMINE") {
        return 3;
    }

    // If there is time information, the machine is currently running.
    return isset($node[4]) ? 2 : 0;
}
| 
 | ||||
/**
 * Clean a string taken from the page.
 *
 * HTML special-character entities are decoded first, then leading and
 * trailing whitespace (spaces, tabs, line breaks) is removed.
 *
 * @param $str (string)
 * @return string
 */
function clean($str)
{
    $decoded = htmlspecialchars_decode($str);

    return trim($decoded);
}
| 
 | ||||
| 
 | ||||
/**
 * Scrape the laundry status page and write the machine list to washinsa.json.
 *
 * The page is expected to contain an element with id "liste-machines" whose
 * first child holds one row per machine, preceded by a title row. The <td>
 * cells of each row are read by position: 0 = type, 1 = number, 2 = status
 * text or progress bar, 4 = start time, 5 = end time.
 *
 * The dump has two lists, 'dryers' and 'washers'; rows of any other type are
 * dropped. Nothing is written when the page cannot be loaded or does not have
 * the expected structure, so a previous dump is left untouched.
 *
 * @return void
 */
function generateJson()
{
    $page = new DOMDocument();
    // Collect libxml parse errors internally instead of emitting one warning per malformed tag.
    $previousErrorMode = libxml_use_internal_errors(true);
    $loaded = $page->loadHTMLFile("https://www.proxiwash.com/weblaverie/ma-laverie-2?s=cf4f39&16d33a57b3fb9a05d4da88969c71de74=1"); //Get page
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    if ($loaded === false) {
        error_log("washinsa: unable to load the laundry page");
        return;
    }

    $division = $page->getElementById("liste-machines");
    $container = ($division !== null) ? $division->firstChild : null;
    if (!($container instanceof DOMElement)) {
        error_log("washinsa: machine list not found in the laundry page");
        return;
    }
    $liste_machines = $container->childNodes;

    //Remove titles
    $titles = $liste_machines->item(0);
    if ($titles !== null) {
        $titles->parentNode->removeChild($titles);
    }

    $return = array(
        'dryers' => array(),
        'washers' => array()
    );

    foreach ($liste_machines as $machines) {
        // Text nodes between rows carry no cells.
        if (!($machines instanceof DOMElement)) {
            continue;
        }

        //Get info from HTML
        $specsArray = array();
        foreach ($machines->childNodes as $info) {
            if ($info instanceof DOMElement && $info->tagName == "td") {
                $specsArray[] = $info;
            }
        }
        // Type, number and status cells are all required to describe a machine.
        if (count($specsArray) < 3) {
            continue;
        }

        //Format array
        $template = array(
            'number' => "",
            'state' => "",
            'startTime' => "",
            'endTime' => "",
            'donePercent' => "",
            'remainingTime' => ""
        );

        //Get Type
        $type = mType(clean($specsArray[0]->nodeValue));

        //Get Number
        if (preg_match('/\d{1,2}/', $specsArray[1]->nodeValue, $number) === 1) {
            $template['number'] = $number[0];
        }

        //Get Status and DonePercent
        switch (mStatus($specsArray)) {
            case 1:
                $template['state'] = "DISPONIBLE";
                break;
            case 2:
                $template['state'] = "EN COURS";
                foreach ($specsArray[2]->childNodes as $child) {
                    if ($child instanceof DOMElement && $child->tagName == "table") {
                        // The progress bar is the first cell of the first row of the nested table.
                        $firstRow = $child->firstChild;
                        $progressBar = ($firstRow !== null) ? $firstRow->firstChild : null;
                        if ($progressBar instanceof DOMElement && $progressBar->getAttribute('bgcolor') == "Green") {
                            // The width attribute ends with a unit character (e.g. "%"), which is dropped.
                            $template['donePercent'] = substr($progressBar->getAttribute('width'), 0, -1);
                        }
                    }
                }
                break;
            case 3:
                $template['state'] = "TERMINE";
                break;
            default:
                $template['state'] = "HORS SERVICE";
                break;
        }

        if ($template['state'] === "EN COURS") { // We set Times only when they could exist
            //Get StartTime (mStatus only reports "current" when cell 4 exists)
            $template['startTime'] = $specsArray[4]->nodeValue;
            //Get EndTime
            $template['endTime'] = isset($specsArray[5]) ? $specsArray[5]->nodeValue : "";
            $template['remainingTime'] = get_remaining_time($template['startTime'], $template['endTime'], $template['donePercent']);
        }

        if ($type == 1) {
            $return['dryers'][] = $template;
        } elseif ($type == 2) {
            $return['washers'][] = $template;
        }
    }

    $jsonData = json_encode($return);
    if ($jsonData === false) {
        error_log("washinsa: unable to encode the machine list as JSON");
        return;
    }
    if (file_put_contents('washinsa.json', $jsonData) === false) {
        error_log("washinsa: unable to write washinsa.json");
    }
}
| 
 | ||||
| 
 | ||||
/**
 * Get remaining time for the current machine
 *
 * The cycle length is the difference between the two times, in minutes; an
 * end time earlier than the start time means the machine ends the following
 * day. The remaining time is the share of that length not yet done.
 *
 * Times are handled as minutes since midnight, so the result does not depend
 * on the configured timezone. A time that is not written as H:MM or HH:MM
 * yields "0"; a non-numeric percentage is treated as 0 % done; the result is
 * never negative even if the reported percentage exceeds 100.
 *
 * @param $startTime (string) Start time, "HH:MM"
 * @param $endTime (string) End time, "HH:MM"
 * @param $percentDone (string) Done percentage, e.g. "62.9"
 * @return string Remaining minutes, rounded to the nearest integer
 */
function get_remaining_time($startTime, $endTime, $percentDone)
{
    $timePattern = '/^(\d{1,2}):(\d{1,2})$/';
    if (preg_match($timePattern, trim((string) $startTime), $start) !== 1
        || preg_match($timePattern, trim((string) $endTime), $end) !== 1) {
        return "0";
    }

    $startMinutes = (int) $start[1] * 60 + (int) $start[2];
    $endMinutes = (int) $end[1] * 60 + (int) $end[2];

    if ($startMinutes > $endMinutes) { // Machine ends the following day
        $endMinutes += 24 * 60;
    }

    $percent = is_numeric($percentDone) ? (float) $percentDone : 0.0;
    $remainingTime = ($endMinutes - $startMinutes) * (1 - $percent / 100);

    return strval(round(max(0.0, $remainingTime), 0));
}
| 
 | ||||
| generateJson(); | ||||
							
								
								
									
										218
									
								
								washinsa/washinsa_handler.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										218
									
								
								washinsa/washinsa_handler.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,218 @@ | |||
| # Parser made with BeautifulSoup4 | ||||
| # https://www.crummy.com/software/BeautifulSoup/bs4/doc | ||||
| 
 | ||||
| from bs4 import BeautifulSoup | ||||
| import urllib.request | ||||
| from enum import Enum, auto | ||||
| import pprint | ||||
| import re | ||||
| import json | ||||
| 
 | ||||
| ''' | ||||
| PAGE STRUCTURE | ||||
| as of June 2020 | ||||
| 
 | ||||
| A table with a row (tr html tag) for each machine | ||||
| Each machine row is composed of 6 columns | ||||
|  - 1 - Type ("SECHE LINGE 14 KG" or "LAVE LINGE 6 KG") | ||||
|  - 2 - Number ("No X" with X the current number) | ||||
|  - 3 - Status (A string showing the status or a table showing the progress) | ||||
|  - 4 - Program (Name of the program or empty) | ||||
|  - 5 - Start time (The start time in format HH:MM or empty) | ||||
|  - 6 - End time (The end time in format HH:MM or empty) | ||||
| ''' | ||||
| 
 | ||||
| DUMP_FILE = "washinsa_data.json" | ||||
| WASHINSA_URL = "https://www.proxiwash.com/weblaverie/component/weblaverie/?view=instancesfiche&format=raw&s=cf4f39" | ||||
| DRYER_STRING = "SECHE LINGE" | ||||
| 
 | ||||
| 
 | ||||
class State(Enum):
    """
    States a machine can be reported in.

    The numeric value of the member is what gets stored in the "state" field of the dumped data.
    """
    # Page shows "DISPONIBLE"
    AVAILABLE = 0
    # Page shows a progress bar instead of a status text
    RUNNING = 1
    # Progress bar shown, but no valid start/end time yet
    RUNNING_NOT_STARTED = 2
    # Page shows "TERMINE"
    FINISHED = 3
    # Page shows "HORS SERVICE"
    UNAVAILABLE = 4
    # Page shows "ERREUR"
    ERROR = 5
    # Status text not recognized
    UNKNOWN = 6
| 
 | ||||
| 
 | ||||
| # Table used to convert state string given by the page into State enum | ||||
| STATE_CONVERSION_TABLE = { | ||||
|     "DISPONIBLE": State.AVAILABLE, | ||||
|     "TERMINE": State.FINISHED, | ||||
|     "HORS SERVICE": State.UNAVAILABLE, | ||||
|     "ERREUR": State.ERROR, | ||||
| } | ||||
| 
 | ||||
# Matches a time written as HH:MM (two digits, a colon, two digits).
# Raw string literal: "\d" inside a plain literal is an invalid escape sequence that Python warns about.
TIME_RE = re.compile(r"^\d\d:\d\d$")
| 
 | ||||
| 
 | ||||
def download_page():
    """
    Downloads the page from proxiwash website

    Returns the body of the response decoded as text, or an empty string if the
    request or the decoding fails. The cause of the failure is printed.
    """
    try:
        # A timeout keeps a stalled connection from blocking the script forever.
        with urllib.request.urlopen(WASHINSA_URL, timeout=30) as response:
            return response.read().decode()
    except Exception as error:  # not a bare except: Ctrl-C and interpreter exit must still propagate
        print("Error processing following url: " + WASHINSA_URL)
        print("Cause: " + repr(error))
        return ""
| 
 | ||||
| 
 | ||||
def get_rows(soup):
    """
    Gets rows corresponding to machines on the page

    The first child of the page's first table is skipped; every child after it is returned.
    The returned list is a copy: the parsed document itself is left untouched.

    Raises ValueError if the page contains no table (for example when the download failed).
    """
    table = soup.table
    if table is None:
        raise ValueError("No table found in the page, cannot extract machine rows")
    return table.contents[1:]
| 
 | ||||
| 
 | ||||
def is_machine_dryer(row):
    """
    Tells whether the machine described by the given row is a dryer.

    The text of the first cell (the Type cell) is searched for the dryer test string.
    Callers treat any machine that is not a dryer as a washer.
    """
    type_cell = row.contents[0]
    type_label = type_cell.text
    return DRYER_STRING in type_label
| 
 | ||||
| 
 | ||||
def get_machine_number(row):
    """
    Gets the identifier of the machine described by the given row.

    The text of the second cell (the Number cell) is split on whitespace and the second word is returned,
    which drops the "No" prefix. The identifier is read from the page rather than deduced from the row
    position, and is returned as text so that non numeric identifiers are supported.
    """
    number_cell = row.contents[1]
    words = number_cell.text.split()
    return words[1]
| 
 | ||||
| 
 | ||||
def get_machine_state(row):
    """
    Gets the current machine state as a State enum member.

    The first element inside the third cell (the State cell) is inspected:
     - if it is a table, the page is showing a progress bar, which means the machine is running;
     - otherwise its text is compared with the keys of the state conversion table,
       and the state of the first key found in the text is returned.

    If no key is found in the text, the unknown state is returned.
    """
    status_node = row.contents[2].contents[0]
    if status_node.name == "table":
        return State.RUNNING

    return next(
        (state for label, state in STATE_CONVERSION_TABLE.items() if label in status_node.text),
        State.UNKNOWN,
    )
| 
 | ||||
| 
 | ||||
def get_machine_program(row):
    """
    Gets the program name: the text of the fourth cell (the Program cell), without surrounding whitespace
    """
    program_cell = row.contents[3]
    program_name = program_cell.text
    return program_name.strip()
| 
 | ||||
| 
 | ||||
def get_machine_times(row):
    """
    Gets the start and end time for the machine.

    The times are read from the fifth and sixth cells, with surrounding whitespace removed.
    If one of these times is invalid (does not respect the HH:MM format),
    an empty string is returned for both of them.
    """
    start_time = row.contents[4].text.strip()
    end_time = row.contents[5].text.strip()
    # fullmatch: the whole text must be a time, a trailing line break is not tolerated
    if TIME_RE.fullmatch(start_time) and TIME_RE.fullmatch(end_time):
        return start_time, end_time
    return "", ""
| 
 | ||||
| 
 | ||||
def get_machine_done_percent(row):
    """
    Gets the machine done percentage, as a string without the "%" sign.

    This percent is given in the width property of the first column, in the table inside the State cell.
    This only applies if the machine is running: an empty string is returned if the State cell
    does not contain a table, or if no numeric width can be read from it.
    Values above 100 are returned as '100'.
    """
    content = row.contents[2].contents[0]
    if content.name != "table":
        return ""

    progress_cell = content.td
    if progress_cell is None:
        return ""

    percent = progress_cell.get("width", "").replace("%", "").strip()
    try:
        value = float(percent)
    except ValueError:
        # Width is missing or not a number: better no percentage than a crash of the whole run
        return ""
    if value > 100:  # because the website is not doing this check...
        return '100'
    return percent
| 
 | ||||
| 
 | ||||
def get_machine_remaining_time(row):
    """
    Gets the remaining time in minutes.

    The time is written in the title property of the table inside the State cell.
    It is written inside a phrase, so we need to extract it: the first word following
    the first "=" sign is returned, as a string.
    This only applies if the machine is running: 0 is returned if the State cell does not contain a table,
    or if the title is missing or does not have the expected form.
    """
    content = row.contents[2].contents[0]
    time = 0
    if content.name == "table":
        parts = content.get("title", "").split("=")
        if len(parts) > 1:
            words = parts[1].split()
            if words:
                time = words[0]
    return time
| 
 | ||||
| 
 | ||||
def get_parsed_data(rows):
    """
    Gets the parsed data from the web page, formatting it in an easy to use object.

    Each row becomes a dictionary holding the machine number and the value of its state.
    Times, program, done percentage and remaining time are only filled in for running machines;
    a running machine without a valid start time is reported as running but not started.

    The result is a dictionary with the list of dryers and the list of washers
    (every machine that is not a dryer is considered to be a washer).
    """
    machines_by_kind = {
        "dryers": [],
        "washers": [],
    }

    for row in rows:
        state = get_machine_state(row)
        machine = dict(
            number=get_machine_number(row),
            state=state.value,
            startTime="",
            endTime="",
            donePercent="",
            remainingTime="",
            program="",
        )

        if state == State.RUNNING:
            start_time, end_time = get_machine_times(row)
            machine["startTime"] = start_time
            machine["endTime"] = end_time
            if not start_time:
                # No valid times on the page: the cycle has not really started yet
                machine["state"] = State.RUNNING_NOT_STARTED.value
            machine["program"] = get_machine_program(row)
            machine["donePercent"] = get_machine_done_percent(row)
            machine["remainingTime"] = get_machine_remaining_time(row)

        kind = "dryers" if is_machine_dryer(row) else "washers"
        machines_by_kind[kind].append(machine)

    return machines_by_kind
| 
 | ||||
| 
 | ||||
def main():
    """
    Downloads the page, parses it and dumps the result as JSON in the dump file.
    """
    soup = BeautifulSoup(download_page(), 'html.parser')
    rows = get_rows(soup)
    # Parse everything BEFORE opening the file: opening in 'w' mode empties it,
    # so a parsing error after that point would leave an empty dump behind.
    data = get_parsed_data(rows)
    with open(DUMP_FILE, 'w') as f:
        json.dump(data, f)
| 
 | ||||
| 
 | ||||
| main() | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  | @ -1,5 +1,5 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| touch lock | ||||
| php index.php | ||||
| python3 washinsa_handler.py > log 2> err | ||||
| rm lock | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue