Skip to content
Snippets Groups Projects
Commit 1b84aa2f authored by Hague Matthew UXAC009's avatar Hague Matthew UXAC009
Browse files

add(timetable): add scrape-timetable script

parent c2626e39
Branches
No related tags found
No related merge requests found
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
* `timetable-ics.py` -- convert the Timetabling team spreadsheets into ICS format. This is a rough and ready script and will probably need tweaks to your particular spreadsheet. * `timetable-ics.py` -- convert the Timetabling team spreadsheets into ICS format. This is a rough and ready script and will probably need tweaks to your particular spreadsheet.
* `get-practical-groups.py` -- find out which lab groups your students belong to on the web timetable. * `get-practical-groups.py` -- find out which lab groups your students belong to on the web timetable.
* `scrape-timetable.py` -- pull ICS files from the timetabling system for departments/modules. Needs some config.
To view the generated .ics files, you might like [ttcal][ttcal] (or just import into your favourite calendar app/website). To view the generated .ics files, you might like [ttcal][ttcal] (or just import into your favourite calendar app/website).
......
# Scrapes timetable from web timetable system
#
# Configure:
#
# * ACADEMIC_YEAR -- e.g. 2425 (for 2024/25; used in the timetable URL)
# * USERNAME -- e.g. uxac009
# * BASE_DATE -- the start of Term 1 for the academic year
# * DEPARTMENTS -- which depts to get activities from
# * MODULES_RE -- filter which modules to get data for
# * CA_CERTS -- path to CA_CERTS file (needed for me, maybe not for you)
import re
import requests
import pytz
from bs4 import BeautifulSoup, Tag
from datetime import datetime, timedelta
from dataclasses import dataclass
from getpass import getpass
from icalendar import Calendar, Event
from typing import Dict, Generator, List, Optional, Set
# --- User configuration ---------------------------------------------------

# Academic year as it appears in the timetable URL (see BASE_URL)
ACADEMIC_YEAR = "2425"
USERNAME = "uxac009"
TIMEZONE = "Europe/London"

# Start of Term 1 (Week 1) for academic year.
# localize() is required with pytz: passing a pytz zone as tzinfo= to the
# datetime constructor attaches the zone's historical LMT offset rather
# than the offset actually in force on that date.
BASE_DATE = pytz.timezone(TIMEZONE).localize(datetime(2024, 9, 23))

DEPARTMENTS = [
    "Computer Science",
    "Mathematics and Information Security Group",
]

# Match modules to get tt for
# Match module format e.g. CS1811 | Object Oriented Programming I
MODULES_RE = re.compile("CS.*|IY.*|PC.*|DC.*")

# CSS class names on timetable cells that identify an activity to export
ACTIVITIES = {
    "Practical",
    "Optional_Attendance",
    "Lecture",
    "Lecture_-_Online",
    "Workshop",
    "Practical_-_Online",
    "Workshop_-_Online",
}

# Time delta from midnight of the first timetable column
TT_BASETIME = timedelta(hours=8)

BEDFORD = 100  # not a year

# --- Timetable site URLs and request headers --------------------------------

TT_HOST = "webtimetables.royalholloway.ac.uk"
TT_HOME_URL = f"https://{TT_HOST}/"
# TT_HOME_URL already ends in "/", so no extra separator is added here
BASE_URL = f"{TT_HOME_URL}SWS/SDB{ACADEMIC_YEAR}SWS"
TT_LOGIN_URL = f"{BASE_URL}/Login.aspx"
TT_DEFAULT_URL = f"{BASE_URL}/default.aspx"

USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"
)

# Because my Python environment doesn't use system certs to verify https
# unless it's told to using verify=CA_CERTS in code below
# Set to True to use your default certs
CA_CERTS = "/etc/ssl/certs/ca-certificates.crt"

HEADERS = {
    "Host": TT_HOST,
    "Origin": TT_HOME_URL,
    "Referer": TT_DEFAULT_URL,
    "User-Agent": USER_AGENT,
}

# --- Patterns used when parsing timetable cells -----------------------------

ONLINE_ACTIVITY_RE = re.compile(".*Online.*", re.IGNORECASE)

# Locations, all caps, at least one letter
LOCATION_RE = re.compile(r"^\s*[-A-Z\d\s]*[A-Z][-A-Z\d\s]*\s*$")

GROUP_RE = re.compile(r"Group (\d+)", re.IGNORECASE)
GROUP_NUM_GRP = 1  # capture group of GROUP_RE holding the group number

# A comma-separated list of module codes, e.g. "CS1811, CS1812"
MODULE_LIST_RE = re.compile(
    r"\s*([A-Z]{2}[0-9]{4}[A-Z]?(,\s+)?)+\s*", re.IGNORECASE
)
MODULE_LIST_SEP = ","

MOD_CODE_RE = re.compile(r"[A-Z]{2}\d{4}[A-Z]?")
STAFF_RE = re.compile(r"[A-Z]*, \w*")

# Day-name prefixes, indexed Monday=0; should be lower case because
# get_day_index lower-cases the label before comparing
DAY_STARTS = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]

# ical format day codes, same indexing as DAY_STARTS
CAL_DAYS = ["MO", "TU", "WE", "TH", "FR", "SA", "SU"]
@dataclass(frozen=True)
class Module:
    """A module offered by a department, as listed by the timetable system.

    Frozen, so instances are hashable and can be collected into sets.
    """

    # Module code, e.g. "CS1811"
    mod_code: str
    # Value of the module's <option> in the timetable's module selector
    tt_mod_id: str
def update_form_data(response, form_data):
    """Refresh the hidden postback fields in form_data from a response.

    Parses response.text and, for each of the hidden state fields below,
    copies the element's "value" attribute into form_data (in place) when
    an element with that id exists.  Fields missing from the page are left
    untouched.
    """
    page = BeautifulSoup(response.text, "lxml")
    state_fields = (
        "__VIEWSTATE",
        "__VIEWSTATEGENERATOR",
        "__EVENTTARGET",
        "__EVENTARGUMENT",
        "__EVENTVALIDATION",
    )
    for field_id in state_fields:
        element = page.find(id=field_id)
        if element is None:
            continue
        form_data[field_id] = element.get("value")
def initialise_form_data(form_data):
    """Seed form_data with the hidden fields of the timetable home page.

    Performs an unauthenticated GET of TT_HOME_URL and updates form_data
    in place.
    """
    home_page = requests.get(TT_HOME_URL, verify=CA_CERTS, headers=HEADERS)
    update_form_data(home_page, form_data)
def login(
    form_data: Dict[str, str], password: str
) -> requests.cookies.RequestsCookieJar:
    """Post the login form and return the cookies from the response.

    form_data is not modified; the credentials are merged into a copy.
    Redirects are not followed, so the cookies are those set by the login
    response itself.  No check is made that the login succeeded.
    """
    credentials = {
        "tUserName": USERNAME,
        "tPassword": password,
        "bLogin": "Login",
    }
    payload = {**form_data, **credentials}
    reply = requests.post(
        TT_LOGIN_URL,
        data=payload,
        headers=HEADERS,
        verify=CA_CERTS,
        allow_redirects=False,
    )
    return reply.cookies
def make_request(
    form_data: Dict[str, str], cookies: requests.cookies.RequestsCookieJar
):
    """Post form_data to the default timetable page and return its HTML.

    form_data is refreshed in place with the hidden fields of the page that
    comes back, ready for the next request.
    """
    reply = requests.post(
        TT_DEFAULT_URL,
        headers=HEADERS,
        cookies=cookies,
        data=form_data,
        verify=CA_CERTS,
    )
    update_form_data(reply, form_data)
    return reply.text
def get_weeks(weeks: str) -> Generator[int, None, None]:
    """Expand a week pattern such as "1-3, 5" into individual week numbers.

    The pattern is a comma-separated list whose items are either a single
    week ("5") or an inclusive range ("1-3").

    Raises:
        ValueError: if an item has more than one "-" or a bound is not an
            integer.
    """
    for period in weeks.split(","):
        bounds = period.strip().split("-")
        if len(bounds) == 1:
            yield int(bounds[0])
        elif len(bounds) == 2:
            first, last = bounds
            # Ranges are inclusive of the upper bound
            yield from range(int(first), int(last) + 1)
        else:
            raise ValueError(f"Unknown weeks pattern {period}")
def get_time(week: int, day_offset: int, act_time: timedelta) -> datetime:
    """Return the datetime of an activity relative to BASE_DATE.

    Week 1 begins at BASE_DATE; day_offset is the number of days into the
    week and act_time the offset from midnight on that day.
    """
    days_from_base = 7 * (week - 1) + day_offset
    return BASE_DATE + timedelta(days=days_from_base) + act_time
def is_row_label(cell: Tag) -> bool:
    """Return True if any CSS class of the cell starts with "row-label".

    The comparison is case-insensitive; a cell with no class attribute is
    not a row label.
    """
    css_classes = cell.attrs.get("class", ())
    return any(
        css_class.lower().startswith("row-label") for css_class in css_classes
    )
def get_activity(cell: Tag) -> Optional[str]:
    """Return the activity type of a cell, or None if it has none.

    The activity is whichever of the cell's CSS classes appears in
    ACTIVITIES.  If several match, an arbitrary one is returned.
    """
    if "class" not in cell.attrs:
        return None
    matching = ACTIVITIES & set(cell["class"])
    if not matching:
        return None
    return next(iter(matching))
def is_location(cell: str) -> bool:
    """Return True if the text matches LOCATION_RE (looks like a location)."""
    return bool(LOCATION_RE.match(cell))
def is_online(activity: str) -> bool:
    """Return True if the activity name matches ONLINE_ACTIVITY_RE."""
    return bool(ONLINE_ACTIVITY_RE.match(activity))
def get_day_index(day: str) -> int:
    """Return the index (Monday = 0) of a day name, or -1 if unrecognised.

    The name is matched case-insensitively against the prefixes in
    DAY_STARTS, so both "Mon" and "Monday" give 0.
    """
    lowered = day.lower()
    return next(
        (
            index
            for index, prefix in enumerate(DAY_STARTS)
            if lowered.startswith(prefix)
        ),
        -1,
    )
def _parse_activity_details(cell: Tag, mod_code: str, activity: str):
    """Extract (group, module_list, location) from an activity cell.

    Every <td> nested inside the cell is inspected: text matching GROUP_RE
    sets the group, text that is a module list replaces the module list,
    and text accepted by is_location sets the location.  Defaults are an
    empty group, [mod_code], and "ONLINE" for online activities ("" for
    everything else).
    """
    group = ""
    module_list = [mod_code]
    location = "ONLINE" if is_online(activity) else ""
    for detail in cell.find_all("td"):
        text = detail.text
        if m := GROUP_RE.match(text):
            group = m[GROUP_NUM_GRP]
        elif MODULE_LIST_RE.fullmatch(text):
            module_list = [
                code.strip() for code in text.split(MODULE_LIST_SEP)
            ]
        elif is_location(text):
            location = text
    return group, module_list, location


def _build_event(
    name: str,
    description: str,
    location: str,
    tt_weeks: List[int],
    day_idx: int,
    activity_time: timedelta,
    length: timedelta,
) -> Event:
    """Build a recurring calendar event covering all the given weeks.

    The event starts in the first listed week and recurs via a YEARLY rule
    restricted to the week-of-year numbers of the listed weeks and to the
    activity's weekday.
    """
    start_time = get_time(tt_weeks[0], day_idx, activity_time)
    end_time = start_time + length
    # Teaching weeks count from BASE_DATE; the rule needs week-of-year numbers
    year_weeks = [
        get_time(week, day_idx, activity_time).isocalendar()[1]
        for week in tt_weeks
    ]
    event = Event()
    event.add("summary", name)
    event.add("description", description)
    event.add("location", location)
    event.add("dtstart", start_time)
    event.add("dtend", end_time)
    event.add("rrule", {
        "freq": "YEARLY",
        "count": len(tt_weeks),
        "byweekno": year_weeks,
        "byday": CAL_DAYS[day_idx]
    })
    return event


def parse_timetable(
    mod_code: str,
    timetable: str,
    cals: Dict[int, Calendar]
):
    """Parse a module's timetable page and add its activities to cals.

    Only top-level tables with the "grid-border-args" class are read.  The
    week pattern is taken from the "row-label-two" cell of the table's
    second row; each later row may carry a "row-label-one" day label.
    Within a row every non-label cell advances the time by colspan
    half-hour slots, starting from TT_BASETIME.

    Each activity cell becomes one recurring event, added to the calendar
    of every year returned by get_years for its module list (unless an
    equal event is already there).

    Args:
        mod_code: code of the module the page was requested for; used when
            the cell does not list its own module codes.
        timetable: HTML of the timetable page.
        cals: calendars keyed by year; updated in place.
    """
    slot = timedelta(minutes=30)
    soup = BeautifulSoup(timetable, "lxml")
    for ttable in soup.body.find_all("table", recursive=False):
        if "grid-border-args" not in ttable.attrs.get("class", []):
            continue
        weeks = ""
        day_idx = 0  # Monday
        tt_rows = ttable.find_all("tr", recursive=False)
        for row_idx, tt_row in enumerate(tt_rows):
            if row_idx == 1:
                weeks = tt_row.find("td", {"class": "row-label-two"}).text
            day_label = tt_row.find("td", {"class": "row-label-one"})
            if day_label is not None:
                day_idx = get_day_index(day_label.text)
            cell_pos = 0
            for cell in tt_row.find_all("td", recursive=False):
                colspan = int(cell.attrs.get("colspan", "1"))
                activity = get_activity(cell)
                if activity:
                    activity_time = TT_BASETIME + cell_pos * slot
                    length = colspan * slot
                    description = re.sub(r"\n+", "\n", cell.text)
                    group, module_list, location = _parse_activity_details(
                        cell, mod_code, activity
                    )
                    tt_weeks = list(get_weeks(weeks))
                    mod_codes = "/".join(sorted(module_list))
                    name = f"{mod_codes} {activity} ({group})"
                    event = _build_event(
                        name,
                        description,
                        location,
                        tt_weeks,
                        day_idx,
                        activity_time,
                        length,
                    )
                    for year in get_years(module_list):
                        # HACK: do this more efficiently and with
                        # better duplicate detection
                        if event not in cals[year].walk():
                            cals[year].add_component(event)
                # Must test the grid cell itself, not one of its nested
                # cells: label cells do not occupy a time slot
                if not is_row_label(cell):
                    cell_pos += colspan
def get_years(mod_codes: List[str]) -> Set[int]:
    """Return the set of year digits found in the given module codes.

    The year is the third character of each code, e.g. 1 for "CS1811".
    """
    return {int(code[2]) for code in mod_codes}
def get_department_page(
    cookies: requests.cookies.RequestsCookieJar,
    department: str,
    form_data: Dict[str, str]
) -> BeautifulSoup:
    """Returns a soup of the department page

    Updates form data for the next request

    Two requests are made: one to open the module view, and one to select
    the department in its department list (which loads the module list).

    Raises:
        ValueError: if the department selector is missing from the module
            page, or has no option whose text equals ``department``.
    """
    form_data.update({"__EVENTTARGET": "LinkBtn_module"})
    module_page = make_request(form_data, cookies)
    soup = BeautifulSoup(module_page, "lxml")

    dept_filter = soup.find(id="dlFilter2")
    if dept_filter is None:
        raise ValueError(
            "Department selector 'dlFilter2' not found on the module page"
            " (is the login valid?)"
        )
    dept_option = dept_filter.find("option", string=department)
    if dept_option is None:
        raise ValueError(
            f"Department {department!r} not found in the department list"
        )
    dept_id: str = dept_option.get("value")

    # select the department from the departments list (loads module list)
    form_data.update({
        "__EVENTTARGET": "dlFilter2",
        "tLinkType": "module",
        "dlFilter2": dept_id
    })
    dept_page = make_request(form_data, cookies)
    return BeautifulSoup(dept_page, "lxml")
def get_department_modules(
    cookies: requests.cookies.RequestsCookieJar,
    init_form_data: Dict[str, str],
    department: str,
) -> Set[Module]:
    """Gets the modules of a department whose listing matches MODULES_RE

    init_form_data is copied, so the caller's dict is left unchanged.
    """
    form_data = dict(init_form_data)
    dept_soup = get_department_page(cookies, department, form_data)
    module_options = dept_soup.find(id="dlObject").find_all("option")
    return {
        # parse e.g. CS1811 | Object Oriented Programming I
        Module(entry.text.split(" ")[0], entry.get("value"))
        for entry in module_options
        if MODULES_RE.fullmatch(entry.text)
    }
def get_module_timetable(
    cookies: requests.cookies.RequestsCookieJar,
    init_form_data: Dict[str, str],
    department: str,
    module: Module,
    cals: Dict[int, Calendar]
):
    """Fetch one module's timetable and add its activities to cals

    The module's tt_mod_id should come from the timetabling system, not the
    module code.  See get_department_modules

    The weeks selector option whose label contains "all" is used; if there
    is none (or it has no value) nothing is requested and cals is left
    unchanged.  init_form_data is copied, not modified.
    """
    form_data = dict(init_form_data)
    dept_soup = get_department_page(cookies, department, form_data)

    week_options = dept_soup.find(id="lbWeeks").find_all("option")
    all_weeks = next(
        (
            entry.get("value")
            for entry in week_options
            if "all" in entry.text.lower()
        ),
        None,
    )
    if all_weeks is None:
        return

    form_data.update({
        "dlObject": module.tt_mod_id,
        "lbWeeks": all_weeks,
        "lbDays": "1-5",
        "dlPeriod": "1-28",
        "RadioType": "individual;swsurl;swsurl",
        "bGetTimetable": "View Timetable",
    })
    timetable_html = make_request(form_data, cookies)
    parse_timetable(module.mod_code, timetable_html, cals)
def get_timetables(password: str):
    """Log in and build calendars for all configured departments.

    Returns a dict of calendars keyed by year (0 to 5) plus BEDFORD,
    filled from every matching module of every entry in DEPARTMENTS.
    """
    cals = {year: Calendar() for year in range(6)}
    cals[BEDFORD] = Calendar()

    form_data = {}
    initialise_form_data(form_data)
    cookies = login(form_data, password)
    # need to refresh form data with a blank request
    make_request(form_data, cookies)
    # Snapshot taken so every department/module starts from the same state
    initial_form_data = dict(form_data)

    for department in DEPARTMENTS:
        department_modules = get_department_modules(
            cookies, initial_form_data, department
        )
        for module in department_modules:
            print("Getting", module.mod_code)
            get_module_timetable(
                cookies, initial_form_data, department, module, cals
            )
    return cals
def save_cals(cals: Dict[int, Calendar]):
    """Write each calendar to an .ics file in the current directory.

    Year calendars go to "Year<N>.ics"; the BEDFORD calendar goes to
    "Bedford.ics".
    """
    for year, calendar in cals.items():
        if year == BEDFORD:
            continue
        with open(f"Year{year}.ics", "wb") as icsfile:
            icsfile.write(calendar.to_ical())
    with open("Bedford.ics", "wb") as icsfile:
        icsfile.write(cals[BEDFORD].to_ical())
# Only prompt for a password and hit the network when run as a script,
# not when the module is imported.
if __name__ == "__main__":
    password = getpass(f"Password for {USERNAME}: ")
    cals = get_timetables(password)
    save_cals(cals)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment