From 1b84aa2f66bf9ceeaca6e4fee3b0a1047f028261 Mon Sep 17 00:00:00 2001
From: Matthew Hague <Matthew.Hague@rhul.ac.uk>
Date: Tue, 29 Oct 2024 11:27:57 +0000
Subject: [PATCH] add(timetable): add scrape-timetable script

---
 timetabling/README.md           |   1 +
 timetabling/scrape-timetable.py | 399 ++++++++++++++++++++++++++++++++
 2 files changed, 400 insertions(+)
 create mode 100644 timetabling/scrape-timetable.py

diff --git a/timetabling/README.md b/timetabling/README.md
index 86d3132..8041e03 100644
--- a/timetabling/README.md
+++ b/timetabling/README.md
@@ -3,6 +3,7 @@
 
 * `timetable-ics.py` -- convert the Timetabling team spreadsheets into ICS format. This is a rough and ready script and will probably need tweaks to your particular spreadsheet.
 * `get-practical-groups.py` -- find out which lab groups your students belong to on the web timetable.
+* `scrape-timetable.py` -- pull ICS files from the timetabling system for departments/modules. Needs some config.
 
 To view the generated .ics files, you might like [ttcal][ttcal] (or just import into your favouring calendar app/website).
 
diff --git a/timetabling/scrape-timetable.py b/timetabling/scrape-timetable.py
new file mode 100644
index 0000000..2ef3560
--- /dev/null
+++ b/timetabling/scrape-timetable.py
@@ -0,0 +1,399 @@
+# Scrapes timetable from web timetable system
+#
+# Configure:
+#
+# * ACADEMIC_YEAR -- e.g. 2425
+# * USERNAME -- e.g. uxac009
+# * BASE_DATE -- the start of Term 1 for the academic year
+# * DEPARTMENTS -- which depts to get activities from
+# * MODULES_RE -- filter which modules to get data for
+# * CA_CERTS -- path to CA_CERTS file (needed for me, maybe not for you)
+
+import re
+import requests
+import pytz
+
+from bs4 import BeautifulSoup, Tag
+from datetime import datetime, timedelta
+from dataclasses import dataclass
+from getpass import getpass
+from icalendar import Calendar, Event
+
+from typing import Dict, Generator, List, Optional, Set
+
+ACADEMIC_YEAR = "2425"
+USERNAME = "uxac009"
+TIMEZONE = "Europe/London"
+# Start of Term 1 (Week 1) for academic year; pytz zones need localize()
+BASE_DATE = pytz.timezone(TIMEZONE).localize(datetime(2024, 9, 23))
+
+DEPARTMENTS = [
+    "Computer Science",
+    "Mathematics and Information Security Group"
+]
+
+# Match modules to get tt for
+# Match module format e.g CS1811 | Object Oriented Programming I
+MODULES_RE = re.compile("CS.*|IY.*|PC.*|DC.*")
+
+ACTIVITIES = set([
+    "Practical",
+    "Optional_Attendance",
+    "Lecture",
+    "Lecture_-_Online",
+    "Workshop",
+    "Practical_-_Online",
+    "Workshop_-_Online"
+])
+
+# Time delta from midnight
+TT_BASETIME = timedelta(hours=8)
+
+BEDFORD = 100 # not a year
+
+TT_HOST = "webtimetables.royalholloway.ac.uk"
+TT_HOME_URL = f"https://{TT_HOST}/"
+BASE_URL = f"{TT_HOME_URL}SWS/SDB{ACADEMIC_YEAR}SWS" # TT_HOME_URL ends in /
+TT_LOGIN_URL = f"{BASE_URL}/Login.aspx"
+TT_DEFAULT_URL = f"{BASE_URL}/default.aspx"
+
+USER_AGENT \
+    = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"
+
+ONLINE_ACTIVITY_RE = re.compile(".*Online.*", re.IGNORECASE)
+# Locations, all caps, at least one letter
+LOCATION_RE = re.compile(r"^\s*[-A-Z\d\s]*[A-Z][-A-Z\d\s]*\s*$")
+GROUP_RE = re.compile(r"Group (\d+)", re.IGNORECASE)
+GROUP_NUM_GRP = 1
+MODULE_LIST_RE = re.compile(
+    r"\s*([A-Z]{2}[0-9]{4}[A-Z]?(,\s+)?)+\s*", re.IGNORECASE
+)
+MODULE_LIST_SEP = ","
+
+# Because my Python environment doesn't use system certs to verify https
+# unless it's told to using verify=CA_CERTS in code below
+# Set to True to use your default certs
+CA_CERTS = "/etc/ssl/certs/ca-certificates.crt"
+
+HEADERS = {
+    "Host": TT_HOST,
+    "Origin": TT_HOME_URL,
+    "Referer": TT_DEFAULT_URL,
+    "User-Agent": USER_AGENT
+}
+
+MOD_CODE_RE = re.compile(r"[A-Z]{2}\d{4}[A-Z]?")
+STAFF_RE = re.compile(r"[A-Z]*, \w*")
+
+# should be lower
+DAY_STARTS = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
+# ical format
+CAL_DAYS = [ "MO", "TU", "WE", "TH", "FR", "SA", "SU" ]
+
+@dataclass(frozen=True)
+class Module:
+    mod_code : str
+    tt_mod_id : str
+
+def update_form_data(response, form_data):
+    soup = BeautifulSoup(response.text, "lxml")
+
+    def update_field(field):
+        vs = soup.find(id=field)
+        if vs is not None:
+            form_data[field] = vs.get("value")
+
+    update_field("__VIEWSTATE")
+    update_field("__VIEWSTATEGENERATOR")
+    update_field("__EVENTTARGET")
+    update_field("__EVENTARGUMENT")
+    update_field("__EVENTVALIDATION")
+
+def initialise_form_data(form_data):
+    response = requests.get(
+        TT_HOME_URL,
+        headers=HEADERS,
+        verify=CA_CERTS
+    )
+    update_form_data(response, form_data)
+
+def login(
+    form_data : Dict[str, str], password : str
+) -> requests.cookies.RequestsCookieJar:
+    response = requests.post(
+        TT_LOGIN_URL,
+        data = form_data | {
+            "tUserName": USERNAME,
+            "tPassword": password,
+            "bLogin": "Login"
+        },
+        headers=HEADERS,
+        allow_redirects=False,
+        verify=CA_CERTS
+    )
+    return response.cookies
+
+def make_request(
+    form_data : Dict[str, str], cookies : requests.cookies.RequestsCookieJar
+):
+    response = requests.post(
+        TT_DEFAULT_URL,
+        data=form_data,
+        cookies=cookies,
+        headers=HEADERS,
+        verify=CA_CERTS
+    )
+    update_form_data(response, form_data)
+    return response.text
+
+def get_weeks(weeks : str) -> Generator[int, None, None]:
+    """Yield each week number in a spec such as "1-3, 5" (ranges inclusive)"""
+    for period in weeks.split(","):
+        bounds = period.strip().split("-")
+        if len(bounds) == 1:
+            yield int(bounds[0])
+        elif len(bounds) == 2:
+            yield from range(int(bounds[0]), int(bounds[1]) + 1)
+        else:
+            raise ValueError(f"Unknown weeks pattern {period}")
+
+def get_time(week : int, day_offset : int, act_time : timedelta) -> datetime:
+    day = BASE_DATE + timedelta(days=7 * (week - 1) + day_offset)
+    return day + act_time
+
+def is_row_label(cell : Tag) -> bool:
+    if "class" not in cell.attrs:
+        return False
+    for cl in cell.attrs["class"]:
+        if cl.lower().startswith("row-label"):
+            return True
+    return False
+
+def get_activity(cell : Tag) -> Optional[str]:
+    if "class" in cell.attrs:
+        acts = ACTIVITIES & set(cell["class"])
+        if len(acts) > 0:
+            return next(iter(acts))
+    return None
+
+def is_location(cell : str) -> bool:
+    return LOCATION_RE.match(cell) is not None
+
+def is_online(activity : str) -> bool:
+    return ONLINE_ACTIVITY_RE.match(activity) is not None
+
+def get_day_index(day : str) -> int:
+    lower_day = day.lower()
+    for (idx, day_start) in enumerate(DAY_STARTS):
+        if lower_day.startswith(day_start):
+            return idx
+    return -1
+
+def parse_timetable(
+    mod_code : str,
+    timetable : str,
+    cals : Dict[int, Calendar]
+):
+    """Parse a timetable page for mod_code, adding its events to cals"""
+    soup = BeautifulSoup(timetable, "lxml")
+    for ttable in soup.body.findChildren("table", recursive=False):
+        if "class" not in ttable.attrs:
+            continue
+        table_type = ttable["class"]
+        if "grid-border-args" in table_type:
+            tt_rows = ttable.findChildren("tr", recursive=False)
+            weeks = ""
+            day_idx = 0 # Monday
+            for (row_idx, tt_row) in enumerate(tt_rows):
+                if row_idx == 1:
+                    weeks = tt_row.find("td", { "class": "row-label-two" }).text
+                day_label = tt_row.find("td", { "class": "row-label-one" })
+                if day_label:
+                    day_idx = get_day_index(day_label.text)
+
+                cell_pos = 0
+                for cell in tt_row.findChildren("td", recursive=False):
+                    colspan = int(cell.attrs.get("colspan", "1"))
+                    activity = get_activity(cell)
+                    if activity:
+                        activity_time = TT_BASETIME \
+                            + cell_pos * timedelta(minutes=30)
+                        group = ""
+                        module_list = [mod_code]
+                        location = "ONLINE" if is_online(activity) else ""
+                        length = colspan * timedelta(minutes=30)
+                        description = re.sub(r"\n+", "\n", cell.text)
+
+                        for sub_cell in cell.find_all("td"):
+                            if m := GROUP_RE.match(sub_cell.text):
+                                group = m[GROUP_NUM_GRP]
+                            elif MODULE_LIST_RE.fullmatch(sub_cell.text):
+                                module_list = [
+                                    code.strip()
+                                    for code in sub_cell.text.split(
+                                        MODULE_LIST_SEP
+                                    )
+                                ]
+                            elif is_location(sub_cell.text):
+                                location = sub_cell.text
+
+                        tt_weeks = list(get_weeks(weeks))
+                        start_time = get_time(
+                            tt_weeks[0], day_idx, activity_time
+                        )
+                        end_time = start_time + length
+
+                        year_weeks = list(map(
+                            lambda w: get_time(
+                                w, day_idx, activity_time
+                            ).isocalendar()[1],
+                            tt_weeks
+                        ))
+
+                        event = Event()
+                        mod_codes = "/".join(sorted(module_list))
+                        name = f"{mod_codes} {activity} ({group})"
+                        event.add("summary", name)
+                        event.add("description", description)
+                        event.add("location", location)
+                        event.add("dtstart", start_time)
+                        event.add("dtend", end_time)
+                        event.add("rrule", {
+                            "freq": "YEARLY",
+                            "count": len(tt_weeks),
+                            "byweekno": year_weeks,
+                            "byday": CAL_DAYS[day_idx]
+                        })
+
+                        for year in get_years(module_list):
+                            # HACK: do this more efficiently and with
+                            # better duplicate detection
+                            if event not in cals[year].walk():
+                                cals[year].add_component(event)
+
+                    # row labels do not take up a half-hour slot
+                    if not is_row_label(cell):
+                        cell_pos += colspan
+
+def get_years(mod_codes : List[str]) -> Set[int]:
+    return set(int(name[2]) for name in mod_codes)
+
+def get_department_page(
+    cookies : requests.cookies.RequestsCookieJar,
+    department : str,
+    form_data : Dict[str, str]
+) -> BeautifulSoup:
+    """Returns a soup of the department page
+    Updates form data for the next request"""
+    form_data.update({ "__EVENTTARGET": "LinkBtn_module" })
+    module_page = make_request(form_data, cookies)
+
+    soup = BeautifulSoup(module_page, "lxml")
+    dept_id : str = soup.find(id="dlFilter2") \
+        .find("option", string=department) \
+        .get("value")
+
+    # select the department from the departments list (loads module list)
+    form_data.update({
+        "__EVENTTARGET": "dlFilter2",
+        "tLinkType": "module",
+        "dlFilter2": dept_id
+    })
+
+    dept_page = make_request(form_data, cookies)
+
+    return BeautifulSoup(dept_page, "lxml")
+
+
+def get_department_modules(
+    cookies : requests.cookies.RequestsCookieJar,
+    init_form_data : Dict[str, str],
+    department : str,
+) -> Set[Module]:
+    """Gets list of module IDs for department"""
+    form_data = dict(init_form_data)
+    soup = get_department_page(cookies, department, form_data)
+    return set(
+        Module(
+            # parse e.g. CS1811 | Object Oriented Programming I
+            option.text.split(" ")[0],
+            option.get("value")
+        )
+        for option in soup.find(id="dlObject").find_all("option")
+        if MODULES_RE.fullmatch(option.text)
+    )
+
+def get_module_timetable(
+    cookies : requests.cookies.RequestsCookieJar,
+    init_form_data : Dict[str, str],
+    department : str,
+    module : Module,
+    cals : Dict[int, Calendar]
+):
+    """Get the timetable for the module with the given id
+    The id should come from the timetabling system, not the module code.
+    See get_department_modules"""
+    form_data = dict(init_form_data)
+    soup = get_department_page(cookies, department, form_data)
+
+    weeks = None
+    for option in soup.find(id="lbWeeks").find_all("option"):
+        val_weeks = option.get("value")
+        val_name = option.text
+        if "all" in val_name.lower():
+            weeks = val_weeks
+            break
+
+    if weeks is not None:
+        form_data.update({
+            "dlObject": module.tt_mod_id,
+            "lbWeeks": weeks,
+            "lbDays": "1-5",
+            "dlPeriod": "1-28",
+            "RadioType": "individual;swsurl;swsurl",
+            "bGetTimetable": "View Timetable",
+        })
+
+        timetable = make_request(form_data, cookies)
+
+        parse_timetable(module.mod_code, timetable, cals)
+
+def get_timetables(password : str):
+    cals = { year: Calendar() for year in range(0, 6) }
+    cals.update({ BEDFORD : Calendar() })
+
+    form_data = dict()
+    initialise_form_data(form_data)
+    cookies = login(form_data, password)
+
+    # need to refresh form data with a blank request
+    make_request(form_data, cookies)
+
+    initial_form_data = dict(form_data)
+
+    for department in DEPARTMENTS:
+        modules = get_department_modules(
+            cookies, initial_form_data, department
+        )
+        for module in modules:
+            print("Getting", module.mod_code)
+            get_module_timetable(
+                cookies, initial_form_data, department, module, cals
+            )
+
+    return cals
+
+def save_cals(cals : Dict[int, Calendar]):
+    """Write each calendar to YearN.ics, and the BEDFORD one to Bedford.ics"""
+    for year in cals:
+        if year != BEDFORD:
+            with open(f"Year{year}.ics", 'wb') as icsfile:
+                icsfile.write(cals[year].to_ical())
+    with open("Bedford.ics", 'wb') as icsfile:
+        icsfile.write(cals[BEDFORD].to_ical())
+
+password = getpass(f"Password for {USERNAME}: ")
+cals = get_timetables(password)
+save_cals(cals)
+
+
-- 
GitLab