diff --git a/timetabling/README.md b/timetabling/README.md
index 86d3132cbdfeacd89669e154b0b3cd97c87d556a..8041e03160fcfadc5f00f196b9ed7078bd04c034 100644
--- a/timetabling/README.md
+++ b/timetabling/README.md
@@ -3,6 +3,7 @@
 
 * `timetable-ics.py` -- convert the Timetabling team spreadsheets into ICS format. This is a rough and ready script and will probably need tweaks to your particular spreadsheet.
 * `get-practical-groups.py` -- find out which lab groups your students belong to on the web timetable.
+* `scrape-timetable.py` -- pull ICS files from the timetabling system for departments/modules. Needs some config.
 
 To view the generated .ics files, you might like [ttcal][ttcal] (or just import into your favouring calendar app/website).
 
diff --git a/timetabling/scrape-timetable.py b/timetabling/scrape-timetable.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ef3560012fb85f27e5a3fce52e9f081d949f0c7
--- /dev/null
+++ b/timetabling/scrape-timetable.py
@@ -0,0 +1,460 @@
+# Scrapes timetable from web timetable system
+#
+# Configure:
+#
+# * ACADEMIC_YEAR -- e.g. "2425" for the 2024/25 academic year
+# * USERNAME -- e.g. uxac009
+# * BASE_DATE -- the start of Term 1 for the academic year
+# * DEPARTMENTS -- which depts to get activities from
+# * MODULES_RE -- filter which modules to get data for
+# * CA_CERTS -- path to CA_CERTS file (needed for me, maybe not for you)
+
+import re
+import requests
+import pytz
+
+from bs4 import BeautifulSoup, Tag
+from datetime import datetime, timedelta
+from dataclasses import dataclass
+from getpass import getpass
+from icalendar import Calendar, Event
+
+from typing import Dict, Generator, List, Optional, Set
+
+ACADEMIC_YEAR = "2425"
+USERNAME = "uxac009"
+TIMEZONE = "Europe/London"
+# Start of Term 1 (Week 1) for academic year
+# Built with localize(): passing a pytz zone straight to datetime(tzinfo=...)
+# picks up the zone's local mean time offset rather than GMT/BST
+BASE_DATE = pytz.timezone(TIMEZONE).localize(datetime(2024, 9, 23))
+
+DEPARTMENTS = [
+    "Computer Science",
+    "Mathematics and Information Security Group"
+]
+
+# Match modules to get tt for
+# Match module format e.g CS1811 | Object Oriented Programming I
+MODULES_RE = re.compile("CS.*|IY.*|PC.*|DC.*")
+
+ACTIVITIES = {
+    "Practical",
+    "Optional_Attendance",
+    "Lecture",
+    "Lecture_-_Online",
+    "Workshop",
+    "Practical_-_Online",
+    "Workshop_-_Online"
+}
+
+# Time delta from midnight
+TT_BASETIME = timedelta(hours=8)
+# Duration of one column of the timetable grid
+TT_SLOT = timedelta(minutes=30)
+
+BEDFORD = 100 # not a year
+
+TT_HOST = "webtimetables.royalholloway.ac.uk"
+TT_HOME_URL = f"https://{TT_HOST}/"
+# TT_HOME_URL already ends with a slash, so don't add a second one
+BASE_URL = f"{TT_HOME_URL}SWS/SDB{ACADEMIC_YEAR}SWS"
+TT_LOGIN_URL = f"{BASE_URL}/Login.aspx"
+TT_DEFAULT_URL = f"{BASE_URL}/default.aspx"
+
+# Seconds to wait for the server; requests never times out by default
+REQUEST_TIMEOUT = 60
+
+USER_AGENT \
+    = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"
+
+ONLINE_ACTIVITY_RE = re.compile(".*Online.*", re.IGNORECASE)
+# Locations, all caps, at least one letter
+LOCATION_RE = re.compile(r"^\s*[-A-Z\d\s]*[A-Z][-A-Z\d\s]*\s*$")
+GROUP_RE = re.compile(r"Group (\d+)", re.IGNORECASE)
+GROUP_NUM_GRP = 1
+MODULE_LIST_RE = re.compile(
+    r"\s*([A-Z]{2}[0-9]{4}[A-Z]?(,\s+)?)+\s*", re.IGNORECASE
+)
+MODULE_LIST_SEP = ","
+
+# Because my Python environment doesn't use system certs to verify https
+# unless it's told to using verify=CA_CERTS in code below
+# Set to True to use your default certs
+CA_CERTS = "/etc/ssl/certs/ca-certificates.crt"
+
+HEADERS = {
+    "Host": TT_HOST,
+    "Origin": TT_HOME_URL,
+    "Referer": TT_DEFAULT_URL,
+    "User-Agent": USER_AGENT
+}
+
+MOD_CODE_RE = re.compile(r"[A-Z]{2}\d{4}[A-Z]?")
+STAFF_RE = re.compile(r"[A-Z]*, \w*")
+
+# must be lower case: compared against lower-cased day labels
+DAY_STARTS = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
+# ical format
+CAL_DAYS = [ "MO", "TU", "WE", "TH", "FR", "SA", "SU" ]
+
+@dataclass(frozen=True)
+class Module:
+    """A module code paired with the timetabling system's id for it"""
+    mod_code : str
+    tt_mod_id : str
+
+def update_form_data(response, form_data):
+    """Copies the hidden ASP.NET fields from a response into form_data
+
+    Fields missing from the page leave form_data untouched"""
+    soup = BeautifulSoup(response.text, "lxml")
+
+    def update_field(field):
+        vs = soup.find(id=field)
+        if vs is not None:
+            form_data[field] = vs.get("value")
+
+    update_field("__VIEWSTATE")
+    update_field("__VIEWSTATEGENERATOR")
+    update_field("__EVENTTARGET")
+    update_field("__EVENTARGUMENT")
+    update_field("__EVENTVALIDATION")
+
+def initialise_form_data(form_data):
+    """Fetches the home page to seed form_data with the hidden fields"""
+    response = requests.get(
+        TT_HOME_URL,
+        headers=HEADERS,
+        timeout=REQUEST_TIMEOUT,
+        verify=CA_CERTS
+    )
+    update_form_data(response, form_data)
+
+def login(
+    form_data : Dict[str, str], password : str
+) -> requests.cookies.RequestsCookieJar:
+    """Posts the login form and returns the cookies the server sets"""
+    response = requests.post(
+        TT_LOGIN_URL,
+        data = form_data | {
+            "tUserName": USERNAME,
+            "tPassword": password,
+            "bLogin": "Login"
+        },
+        headers=HEADERS,
+        allow_redirects=False,
+        timeout=REQUEST_TIMEOUT,
+        verify=CA_CERTS
+    )
+    return response.cookies
+
+def make_request(
+    form_data : Dict[str, str], cookies : requests.cookies.RequestsCookieJar
+) -> str:
+    """Posts form_data to the timetable page and returns the page text
+
+    Updates form_data with the hidden fields of the response, ready for
+    the next request"""
+    response = requests.post(
+        TT_DEFAULT_URL,
+        data=form_data,
+        cookies=cookies,
+        headers=HEADERS,
+        timeout=REQUEST_TIMEOUT,
+        verify=CA_CERTS
+    )
+    update_form_data(response, form_data)
+    return response.text
+
+def get_weeks(weeks : str) -> Generator[int, None, None]:
+    """Yields each week number of a weeks string (1-3, 5 gives 1, 2, 3, 5)
+
+    Raises ValueError for a period that is not a number or a range"""
+    for period in weeks.split(","):
+        bounds = period.strip().split("-")
+        if len(bounds) == 1:
+            yield int(bounds[0])
+        elif len(bounds) == 2:
+            yield from range(int(bounds[0]), int(bounds[1]) + 1)
+        else:
+            raise ValueError(f"Unknown weeks pattern {period}")
+
+def get_time(week : int, day_offset : int, act_time : timedelta) -> datetime:
+    """Returns the local time of an activity
+
+    week is the teaching week (1 is the week of BASE_DATE), day_offset the
+    number of days into that week, act_time the offset from midnight"""
+    # Do the arithmetic on wall-clock time and attach the zone last, so
+    # times stay right after the clocks change
+    wall_clock = BASE_DATE.replace(tzinfo=None) \
+        + timedelta(days=7 * (week - 1) + day_offset) \
+        + act_time
+    return pytz.timezone(TIMEZONE).localize(wall_clock)
+
+def is_row_label(cell : Tag) -> bool:
+    """True if any class of the cell starts with row-label"""
+    return any(
+        cl.lower().startswith("row-label")
+        for cl in cell.attrs.get("class", [])
+    )
+
+def get_activity(cell : Tag) -> Optional[str]:
+    """Returns the activity type named in the cell's classes, if any"""
+    acts = ACTIVITIES & set(cell.attrs.get("class", []))
+    return next(iter(acts), None)
+
+def is_location(cell : str) -> bool:
+    """True if the text looks like a location (see LOCATION_RE)"""
+    return LOCATION_RE.match(cell) is not None
+
+def is_online(activity : str) -> bool:
+    """True if the activity type is an online one"""
+    return ONLINE_ACTIVITY_RE.match(activity) is not None
+
+def get_day_index(day : str) -> int:
+    """Returns the index in DAY_STARTS of a day name, or -1 if unknown"""
+    lower_day = day.lower()
+    for (idx, day_start) in enumerate(DAY_STARTS):
+        if lower_day.startswith(day_start):
+            return idx
+    return -1
+
+def make_event(
+    mod_code : str,
+    activity : str,
+    cell : Tag,
+    weeks : str,
+    day_idx : int,
+    cell_pos : int,
+    colspan : int
+) -> tuple[Event, List[str]]:
+    """Builds the calendar event for one activity cell of the grid
+
+    cell_pos and colspan are counted in grid columns. Returns the event
+    and the codes of the modules it is for"""
+    activity_time = TT_BASETIME + cell_pos * TT_SLOT
+    length = colspan * TT_SLOT
+    group = ""
+    module_list = [mod_code]
+    location = "ONLINE" if is_online(activity) else ""
+    description = re.sub(r"\n+", "\n", cell.text)
+
+    # nested cells may hold the group, the module list or the location
+    for detail in cell.find_all("td"):
+        if m := GROUP_RE.match(detail.text):
+            group = m[GROUP_NUM_GRP]
+        elif MODULE_LIST_RE.fullmatch(detail.text):
+            module_list = [
+                code.strip()
+                for code in detail.text.split(MODULE_LIST_SEP)
+            ]
+        elif is_location(detail.text):
+            location = detail.text
+
+    tt_weeks = list(get_weeks(weeks))
+    start_time = get_time(tt_weeks[0], day_idx, activity_time)
+    end_time = start_time + length
+
+    year_weeks = [
+        get_time(week, day_idx, activity_time).isocalendar()[1]
+        for week in tt_weeks
+    ]
+
+    event = Event()
+    mod_codes = "/".join(sorted(module_list))
+    event.add("summary", f"{mod_codes} {activity} ({group})")
+    event.add("description", description)
+    event.add("location", location)
+    event.add("dtstart", start_time)
+    event.add("dtend", end_time)
+    event.add("rrule", {
+        "freq": "YEARLY",
+        "count": len(tt_weeks),
+        "byweekno": year_weeks,
+        "byday": CAL_DAYS[day_idx]
+    })
+    return event, module_list
+
+def parse_timetable(
+    mod_code : str,
+    timetable : str,
+    cals : Dict[int, Calendar]
+):
+    """Adds the activities on a timetable page to the year calendars
+
+    mod_code is used for activities that don't list their own modules"""
+    soup = BeautifulSoup(timetable, "lxml")
+    for ttable in soup.body.find_all("table", recursive=False):
+        if "grid-border-args" not in ttable.attrs.get("class", []):
+            continue
+
+        weeks = ""
+        day_idx = 0 # Monday
+        for (row_idx, tt_row) in enumerate(
+            ttable.find_all("tr", recursive=False)
+        ):
+            if row_idx == 1:
+                weeks = tt_row.find("td", { "class": "row-label-two" }).text
+
+            day_label = tt_row.find("td", { "class": "row-label-one" })
+            if day_label:
+                day_idx = get_day_index(day_label.text)
+
+            cell_pos = 0
+            for cell in tt_row.find_all("td", recursive=False):
+                colspan = int(cell.attrs.get("colspan", "1"))
+
+                activity = get_activity(cell)
+                if activity:
+                    event, module_list = make_event(
+                        mod_code, activity, cell,
+                        weeks, day_idx, cell_pos, colspan
+                    )
+                    for year in get_years(module_list):
+                        # HACK: do this more efficiently and with
+                        # better duplicate detection
+                        if event not in cals[year].walk():
+                            cals[year].add_component(event)
+
+                if not is_row_label(cell):
+                    cell_pos += colspan
+
+def get_years(mod_codes : List[str]) -> Set[int]:
+    """Returns the years of the modules: the third character of each code"""
+    return {int(name[2]) for name in mod_codes}
+
+def get_department_page(
+    cookies : requests.cookies.RequestsCookieJar,
+    department : str,
+    form_data : Dict[str, str]
+) -> BeautifulSoup:
+    """Returns a soup of the department page
+    Updates form data for the next request
+
+    Raises ValueError if the department is not offered by the page"""
+    form_data.update({ "__EVENTTARGET": "LinkBtn_module" })
+    module_page = make_request(form_data, cookies)
+
+    soup = BeautifulSoup(module_page, "lxml")
+    dept_filter = soup.find(id="dlFilter2")
+    dept_option = None
+    if dept_filter is not None:
+        dept_option = dept_filter.find("option", string=department)
+    if dept_option is None:
+        raise ValueError(
+            f"Department {department!r} not found (did the login work?)"
+        )
+    dept_id : str = dept_option.get("value")
+
+    # select the department from the departments list (loads module list)
+    form_data.update({
+        "__EVENTTARGET": "dlFilter2",
+        "tLinkType": "module",
+        "dlFilter2": dept_id
+    })
+
+    dept_page = make_request(form_data, cookies)
+
+    return BeautifulSoup(dept_page, "lxml")
+
+
+def get_department_modules(
+    cookies : requests.cookies.RequestsCookieJar,
+    init_form_data : Dict[str, str],
+    department : str,
+) -> Set[Module]:
+    """Gets list of module IDs for department"""
+    form_data = dict(init_form_data)
+    soup = get_department_page(cookies, department, form_data)
+    return {
+        Module(
+            # parse e.g. CS1811 | Object Oriented Programming I
+            option.text.split(" ")[0],
+            option.get("value")
+        )
+        for option in soup.find(id="dlObject").find_all("option")
+        if MODULES_RE.fullmatch(option.text)
+    }
+
+def get_module_timetable(
+    cookies : requests.cookies.RequestsCookieJar,
+    init_form_data : Dict[str, str],
+    department : str,
+    module : Module,
+    cals : Dict[int, Calendar]
+):
+    """Get the timetable for the module with the given id
+    The id should come from the timetabling system, not the module code.
+    See get_department_modules"""
+    form_data = dict(init_form_data)
+    soup = get_department_page(cookies, department, form_data)
+
+    # pick the entry of the weeks list that covers all weeks
+    weeks = None
+    for option in soup.find(id="lbWeeks").find_all("option"):
+        if "all" in option.text.lower():
+            weeks = option.get("value")
+            break
+
+    if weeks is None:
+        # without a weeks selection there is no timetable to ask for
+        print("No weeks option found for", module.mod_code)
+        return
+
+    form_data.update({
+        "dlObject": module.tt_mod_id,
+        "lbWeeks": weeks,
+        "lbDays": "1-5",
+        "dlPeriod": "1-28",
+        "RadioType": "individual;swsurl;swsurl",
+        "bGetTimetable": "View Timetable",
+    })
+
+    timetable = make_request(form_data, cookies)
+
+    parse_timetable(module.mod_code, timetable, cals)
+
+def get_timetables(password : str) -> Dict[int, Calendar]:
+    """Logs in and scrapes every matching module of every department
+
+    Returns the calendars keyed by year, plus one under BEDFORD"""
+    cals = { year: Calendar() for year in range(0, 6) }
+    cals.update({ BEDFORD : Calendar() })
+
+    form_data = {}
+    initialise_form_data(form_data)
+    cookies = login(form_data, password)
+
+    # need to refresh form data with a blank request
+    make_request(form_data, cookies)
+
+    initial_form_data = dict(form_data)
+
+    for department in DEPARTMENTS:
+        modules = get_department_modules(
+            cookies, initial_form_data, department
+        )
+        for module in modules:
+            print("Getting", module.mod_code)
+            get_module_timetable(
+                cookies, initial_form_data, department, module, cals
+            )
+
+    return cals
+
+def save_cals(cals : Dict[int, Calendar]):
+    """Writes each year's calendar to YearN.ics and BEDFORD's to Bedford.ics"""
+    for year in cals:
+        if year != BEDFORD:
+            with open(f"Year{year}.ics", 'wb') as icsfile:
+                icsfile.write(cals[year].to_ical())
+
+    with open("Bedford.ics", 'wb') as icsfile:
+        icsfile.write(cals[BEDFORD].to_ical())
+
+# Only scrape when run as a script, so importing the module has no effects
+if __name__ == "__main__":
+    password = getpass(f"Password for {USERNAME}: ")
+    cals = get_timetables(password)
+    save_cals(cals)