application-amicale-serveur/washinsa/washinsa_handler.py

# Parser made with BeautifulSoup4
# https://www.crummy.com/software/BeautifulSoup/bs4/doc

from bs4 import BeautifulSoup
import urllib.request
from enum import Enum, auto
import pprint
import re
import json

'''
PAGE STRUCTURE
As of June 2020

A table with one row (tr HTML tag) for each machine.
Each machine row is composed of 6 columns:
 - 1 - Type ("SECHE LINGE 14 KG" or "LAVE LINGE 6 KG")
 - 2 - Number ("No X" with X the current number)
 - 3 - Status (A string showing the status or a table showing the progress)
 - 4 - Program (Name of the program or empty)
 - 5 - Start time (The start time in format HH:MM or empty)
 - 6 - End time (The end time in format HH:MM or empty)
'''

# Path of the JSON file in which main() writes the parsed machine data
DUMP_FILE = "washinsa_data.json"
# Address of the page fetched by download_page() and then parsed
WASHINSA_URL = "https://www.proxiwash.com/weblaverie/component/weblaverie/?view=instancesfiche&format=raw&s=cf4f39"
# Substring looked for in the Type cell of a row to tell dryers apart from washers
DRYER_STRING = "SECHE LINGE"


class State(Enum):
    """
    State of a machine.

    The numeric value of each member is what gets stored in the "state" field
    of the generated JSON, so existing values must not be renumbered.
    """
    AVAILABLE = 0  # page shows "DISPONIBLE"
    RUNNING = 1  # the Status cell contains a progress table instead of text
    RUNNING_NOT_STARTED = 2  # running, but start/end times are not both valid HH:MM
    FINISHED = 3  # page shows "TERMINE"
    UNAVAILABLE = 4  # page shows "HORS SERVICE"
    ERROR = 5  # page shows "ERREUR"
    UNKNOWN = 6  # status text matched none of the known strings


# Table used to convert the state string given by the page into a State enum.
# Keys are searched as substrings of the Status cell text, in the order written
# here, and the first key found wins (see get_machine_state).
# State.RUNNING is not listed: it is detected from the progress table instead.
STATE_CONVERSION_TABLE = {
    "DISPONIBLE": State.AVAILABLE,
    "TERMINE": State.FINISHED,
    "HORS SERVICE": State.UNAVAILABLE,
    "ERREUR": State.ERROR,
}

# Matches a time written as HH:MM (two digits, a colon, two digits).
# The pattern is a raw string so that "\d" reaches the regex engine as-is
# instead of being an invalid string escape sequence (which newer Python
# versions warn about).
TIME_RE = re.compile(r"^\d\d:\d\d$")


def download_page():
    """
    Downloads the page from proxiwash website

    Returns:
        The decoded body of the page as a string, or an empty string if the
        page could not be fetched or decoded (an error is printed in that case).
    """
    try:
        # A timeout keeps a stalled connection from blocking the script forever
        with urllib.request.urlopen(WASHINSA_URL, timeout=30) as response:
            return response.read().decode()
    except Exception as error:
        # Best effort: any network, HTTP or decoding failure is reported and
        # turned into an empty page. Unlike a bare "except", this still lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Error processing following url: " + WASHINSA_URL)
        print(repr(error))
        return ""


def get_rows(soup):
    """
    Gets rows corresponding to machines on the page

    The first child of the table is skipped (presumably the header row), every
    following child is returned.

    A new list is returned and the parsed document is left untouched, so this
    function can safely be called several times on the same soup. An empty
    table gives an empty list.

    Raises:
        AttributeError: if the document contains no table
    """
    # Slicing copies the list: deleting from "contents" directly would edit
    # the parse tree itself and drop one more row on every call.
    return soup.table.contents[1:]


def is_machine_dryer(row):
    """
    Tells whether the given row describes a dryer. Any row that is not a dryer is considered a washer.

    The decision is made by looking for the dryer marker string in the text of the first cell of the row (the Type cell)
    """
    type_cell = row.contents[0]
    machine_type = type_cell.text
    return DRYER_STRING in machine_type


def get_machine_number(row):
    """
    Reads the machine identifier from the Number cell (second cell of the row).

    The cell text looks like "No X": it is split on whitespace and the second word is returned as a string.
    The identifier is read from the page rather than deduced from the row position,
    which could be wrong after parsing, and this also supports identifiers that are not numeric.
    """
    number_cell = row.contents[1]
    words = number_cell.text.split()
    # words[0] is the "No" prefix, words[1] is the identifier itself
    return words[1]


def get_machine_state(row):
    """
    Works out the state of the machine from the Status cell (third cell of the row).

    When the first element of the cell is a table (the progress bar), the machine is running.

    Otherwise the cell holds plain text: each key of the state translation table is searched
    in that text, and the State of the first key found is returned.

    If no key is found, the unknown state is returned.
    """
    status = row.contents[2].contents[0]
    if status.name == "table":
        return State.RUNNING
    return next(
        (state for label, state in STATE_CONVERSION_TABLE.items() if label in status.text),
        State.UNKNOWN,
    )


def get_machine_program(row):
    """
    Reads the program name from the Program cell (fourth cell of the row), without surrounding whitespace
    """
    program_cell = row.contents[3]
    raw_program = program_cell.text
    return raw_program.strip()


def get_machine_times(row):
    """
    Reads the start and end times from the fifth and sixth cells of the row.

    Returns a (start, end) tuple. If either of the two times does not respect the HH:MM format,
    both are replaced by empty strings.
    """
    begin = row.contents[4].text
    finish = row.contents[5].text
    if not TIME_RE.match(begin) or not TIME_RE.match(finish):
        return "", ""
    return begin, finish


def get_machine_done_percent(row):
    """
    Reads how much of the program is done, as a percentage.

    The value comes from the width attribute of the first td of the progress table found in the Status cell,
    with the "%" sign removed. It is returned as a string, capped to '100'.
    If the Status cell does not start with a table, an empty string is returned.
    """
    status = row.contents[2].contents[0]
    if status.name != "table":
        return ""
    width = status.td["width"]
    done = width.replace("%", "")
    # the website may report more than 100%, so the value is capped here
    return '100' if float(done) > 100 else done


def get_machine_remaining_time(row):
    """
    Reads the remaining time of the running program.

    The value is taken from the title attribute of the progress table found in the Status cell:
    it is the first word following the first "=" sign of that title, returned as a string.
    If the Status cell does not start with a table, the integer 0 is returned.
    """
    status = row.contents[2].contents[0]
    if status.name != "table":
        return 0
    after_equals = status["title"].split("=")[1]
    return after_equals.split()[0]


def get_parsed_data(rows):
    """
    Turns the machine rows of the web page into an easy to use object.

    The result is a dict with two lists, "dryers" and "washers". Each machine is a dict with the keys
    number, state, startTime, endTime, donePercent, remainingTime and program.
    The last five keys are only filled for running machines, and stay empty strings otherwise.
    """
    machines = {
        "dryers": [],
        "washers": []
    }
    for row in rows:
        state = get_machine_state(row)
        entry = dict(
            number=get_machine_number(row),
            state=state.value,
            startTime="",
            endTime="",
            donePercent="",
            remainingTime="",
            program="",
        )
        if state == State.RUNNING:
            start, end = get_machine_times(row)
            entry["startTime"] = start
            entry["endTime"] = end
            if not start:
                # No valid times: the machine is reserved but has not started yet
                entry["state"] = State.RUNNING_NOT_STARTED.value
            entry["program"] = get_machine_program(row)
            entry["donePercent"] = get_machine_done_percent(row)
            entry["remainingTime"] = get_machine_remaining_time(row)

        category = "dryers" if is_machine_dryer(row) else "washers"
        machines[category].append(entry)

    return machines


def main():
    """
    Downloads the page, parses it and writes the result as JSON in DUMP_FILE.

    If the download failed (empty page), nothing is written so that the
    previous dump is kept.
    """
    page = download_page()
    if not page:
        # download_page signals a failure with an empty string: keep the last
        # good dump rather than failing while parsing an empty document
        print("No data downloaded, " + DUMP_FILE + " was left unchanged")
        return
    soup = BeautifulSoup(page, 'html.parser')
    # Parse everything BEFORE opening the file: opening with 'w' truncates it,
    # so a parsing error while it is open would leave an empty or partial dump
    data = get_parsed_data(get_rows(soup))
    with open(DUMP_FILE, 'w') as f:
        json.dump(data, f)


# Script entry point. There is no __main__ guard, so the download and dump
# also run whenever this file is imported as a module.
main()