Skip to content
Snippets Groups Projects
Commit 3948f3b0 authored by Hague Matthew UXAC009's avatar Hague Matthew UXAC009
Browse files

feat(timetable): add option to scrape room timetables to scraper

parent 5c2376cf
Branches
No related tags found
No related merge requests found
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
* `timetable-ics.py` -- convert the Timetabling team spreadsheets into ICS format. This is a rough and ready script and will probably need tweaks to your particular spreadsheet. * `timetable-ics.py` -- convert the Timetabling team spreadsheets into ICS format. This is a rough and ready script and will probably need tweaks to your particular spreadsheet.
* `get-practical-groups.py` -- find out which lab groups your students belong to on the web timetable. * `get-practical-groups.py` -- find out which lab groups your students belong to on the web timetable.
* `scrape-timetable.py` -- pull ICS files from the timetabling system for departments/modules. Needs some config. * `scrape-timetable.py` -- pull ICS files from the timetabling system for departments/modules/rooms. Needs some config. Pass `rooms` as a command line argument to scrape room timetables; without it, module timetables are scraped.
To view the generated .ics files, you might like [ttcal][ttcal] (or just import into your favourite calendar app/website). To view the generated .ics files, you might like [ttcal][ttcal] (or just import into your favourite calendar app/website).
......
# Scrapes timetable from web timetable system # Scrapes timetable from web timetable system
# #
# Run with "rooms" argument to get timetable for rooms on campus
#
# Configure: # Configure:
# #
# * ACADEMIC_YEAR -- e.g. 202425 # * ACADEMIC_YEAR -- e.g. 202425
...@@ -12,8 +14,10 @@ ...@@ -12,8 +14,10 @@
import re import re
import requests import requests
import pytz import pytz
import sys
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from collections import defaultdict
from datetime import datetime, timedelta from datetime import datetime, timedelta
from dataclasses import dataclass from dataclasses import dataclass
from getpass import getpass from getpass import getpass
...@@ -32,19 +36,39 @@ DEPARTMENTS = [ ...@@ -32,19 +36,39 @@ DEPARTMENTS = [
"Mathematics and Information Security Group" "Mathematics and Information Security Group"
] ]
ROOM_SETS = [
"Central Booking",
"Computer Science"
]
# Match modules to get tt for # Match modules to get tt for
# Match module format e.g CS1811 | Object Oriented Programming I # Match module format e.g CS1811 | Object Oriented Programming I
MODULES_RE = re.compile("CS.*|IY.*|PC.*|DC.*") MODULES_RE = re.compile("CS.*|IY.*|PC.*|DC.*")
ACTIVITIES = {
    "Assessment",
    "Lecture",
    "Lecture_-_Online",
    "Optional_Attendance",
    "Optional_Attendance_-_Online",
    "Practical",
    "Practical_-_Online",
    "Workshop",
    "Workshop_-_Online",
}

# td classes that mark a timetable cell as any kind of activity, not just
# those in ACTIVITIES (see is_activity)
ALL_ACTIVITIES = ACTIVITIES | {
    "Booking",
    "Booking_Accepted",
    "Booking_Requested",
    "Drop_In",
    "Employability",
    "Rescheduled",
    "Seminar",
    "Tutorial",
}
# Time delta from midnight # Time delta from midnight
TT_BASETIME = timedelta(hours=8) TT_BASETIME = timedelta(hours=8)
...@@ -95,6 +119,11 @@ class Module: ...@@ -95,6 +119,11 @@ class Module:
mod_code : str mod_code : str
tt_mod_id : str tt_mod_id : str
@dataclass(frozen=True)
class Room:
    """A room as listed by the timetabling system (see get_rooms)"""
    # text of the room's option in the room selector
    room_name : str
    # value of that option, sent back as "dlObject" to request the room
    tt_name_id : str
@dataclass(frozen=True) @dataclass(frozen=True)
class Fingerprint: class Fingerprint:
mod_code : str mod_code : str
...@@ -160,6 +189,9 @@ def make_request( ...@@ -160,6 +189,9 @@ def make_request(
return response.text return response.text
def get_weeks(weeks : str) -> Generator[int, None, None]: def get_weeks(weeks : str) -> Generator[int, None, None]:
if len(weeks.strip()) == 0:
return
for period in weeks.split(","): for period in weeks.split(","):
bounds = period.strip().split("-") bounds = period.strip().split("-")
if len(bounds) == 1: if len(bounds) == 1:
...@@ -189,6 +221,13 @@ def get_activity(cell : Tag) -> Optional[str]: ...@@ -189,6 +221,13 @@ def get_activity(cell : Tag) -> Optional[str]:
return next(iter(acts)) return next(iter(acts))
return None return None
def is_activity(cell : Tag) -> bool:
    """True if the cell carries a class of any kind of activity

    Checks against ALL_ACTIVITIES, not just those in ACTIVITIES. A cell
    without a class attribute is never an activity."""
    if "class" not in cell.attrs:
        return False
    cell_classes = set(cell["class"])
    return not cell_classes.isdisjoint(ALL_ACTIVITIES)
def is_location(cell : str) -> bool: def is_location(cell : str) -> bool:
return LOCATION_RE.match(cell) is not None return LOCATION_RE.match(cell) is not None
...@@ -202,7 +241,7 @@ def get_day_index(day : str) -> int: ...@@ -202,7 +241,7 @@ def get_day_index(day : str) -> int:
return idx return idx
return -1 return -1
def parse_timetable( def parse_module_timetable(
mod_code : str, mod_code : str,
timetable : str, timetable : str,
cals : Dict[int, FingerCal] cals : Dict[int, FingerCal]
...@@ -279,7 +318,9 @@ def parse_timetable( ...@@ -279,7 +318,9 @@ def parse_timetable(
"byday": CAL_DAYS[day_idx] "byday": CAL_DAYS[day_idx]
}) })
fingerprint = Fingerprint(mod_codes, location, start_time, end_time) fingerprint = Fingerprint(
mod_codes, location, start_time, end_time
)
for year in get_years(module_list): for year in get_years(module_list):
# HACK: do this more efficiently and will # HACK: do this more efficiently and will
...@@ -299,32 +340,32 @@ def parse_timetable( ...@@ -299,32 +340,32 @@ def parse_timetable(
def get_years(mod_codes : List[str]) -> Set[int]:
    """Returns the distinct year digits of the given module codes

    The year is read from the third character of each code, e.g. the 1
    in CS1811"""
    return { int(code[2]) for code in mod_codes }
def get_set_page(
    cookies : requests.cookies.RequestsCookieJar,
    link_type : str,
    data_set_name : str,
    form_data : Dict[str, str]
) -> BeautifulSoup:
    """Returns a soup of the page listing the members of a data set

    link_type picks the kind of timetable ("module" and "location" are
    the values used in this file). data_set_name is the option text to
    select in the dlFilter2 dropdown, e.g. a department or a room set.

    Updates form data for the next request.

    Raises ValueError if the dropdown or the named option is missing."""
    form_data.update({ "__EVENTTARGET": f"LinkBtn_{link_type}" })
    link_page = make_request(form_data, cookies)
    soup = BeautifulSoup(link_page, "lxml")

    # find() gives None for a missing element; fail with the set name
    # rather than an AttributeError on None further down
    set_filter = soup.find(id="dlFilter2")
    option = None
    if set_filter is not None:
        option = set_filter.find("option", string=data_set_name)
    if option is None:
        raise ValueError(
            f"no {link_type} set named {data_set_name!r} on timetable page"
        )
    data_set_id : str = option.get("value")

    # select the set from the filter list (loads the list of its members)
    form_data.update({
        "__EVENTTARGET": "dlFilter2",
        "tLinkType": link_type,
        "dlFilter2": data_set_id
    })

    set_page = make_request(form_data, cookies)

    return BeautifulSoup(set_page, "lxml")
def get_department_modules( def get_department_modules(
cookies : requests.cookies.RequestsCookieJar, cookies : requests.cookies.RequestsCookieJar,
...@@ -333,7 +374,7 @@ def get_department_modules( ...@@ -333,7 +374,7 @@ def get_department_modules(
) -> Set[Module]: ) -> Set[Module]:
"""Gets list of module IDs for department""" """Gets list of module IDs for department"""
form_data = dict(init_form_data) form_data = dict(init_form_data)
soup = get_department_page(cookies, department, form_data) soup = get_set_page(cookies, "module", department, form_data)
return set( return set(
Module( Module(
# parse e.g. CS1811 | Object Oriented Programming I # parse e.g. CS1811 | Object Oriented Programming I
...@@ -355,7 +396,7 @@ def get_module_timetable( ...@@ -355,7 +396,7 @@ def get_module_timetable(
The id should come from the timetabling system, not the module code. The id should come from the timetabling system, not the module code.
See get_department_modules""" See get_department_modules"""
form_data = dict(init_form_data) form_data = dict(init_form_data)
soup = get_department_page(cookies, department, form_data) soup = get_set_page(cookies, "module", department, form_data)
weeks = None weeks = None
for option in soup.find(id="lbWeeks").find_all("option"): for option in soup.find(id="lbWeeks").find_all("option"):
...@@ -377,9 +418,9 @@ def get_module_timetable( ...@@ -377,9 +418,9 @@ def get_module_timetable(
timetable = make_request(form_data, cookies) timetable = make_request(form_data, cookies)
parse_timetable(module.mod_code, timetable, cals) parse_module_timetable(module.mod_code, timetable, cals)
def get_timetables(password : str): def get_timetables_years(password : str):
cals = { year: FingerCal(Calendar(), set()) for year in range(0, 6) } cals = { year: FingerCal(Calendar(), set()) for year in range(0, 6) }
cals.update({ BEDFORD : FingerCal(Calendar(), set()) }) cals.update({ BEDFORD : FingerCal(Calendar(), set()) })
...@@ -404,7 +445,7 @@ def get_timetables(password : str): ...@@ -404,7 +445,7 @@ def get_timetables(password : str):
return cals return cals
def save_cals(cals : Dict[int, FingerCal]): def save_cals_years(cals : Dict[int, FingerCal]):
for year in cals: for year in cals:
if year != BEDFORD: if year != BEDFORD:
with open(f"Year{year}.ics", 'wb') as icsfile: with open(f"Year{year}.ics", 'wb') as icsfile:
...@@ -413,8 +454,165 @@ def save_cals(cals : Dict[int, FingerCal]): ...@@ -413,8 +454,165 @@ def save_cals(cals : Dict[int, FingerCal]):
with open(f"Bedford.ics", 'wb') as icsfile: with open(f"Bedford.ics", 'wb') as icsfile:
icsfile.write(cals[BEDFORD].cal.to_ical()) icsfile.write(cals[BEDFORD].cal.to_ical())
def _make_room_event(
    cell : Tag,
    room_name : str,
    capacity : int,
    day_idx : int,
    activity_time : timedelta,
    length : timedelta,
    tt_weeks : List[int]
) -> Event:
    """Builds the recurring calendar event for one activity cell

    tt_weeks must not be empty: the first week gives the start date."""
    description = re.sub(r"\n+", "\n", cell.text)
    # the first non-empty line of the cell text serves as the title
    summary = next(
        (line for line in description.split("\n") if len(line) > 0),
        "Unknown Event"
    )

    # one occurrence per timetable week, same weekday and time of day
    occurrences = [
        get_time(week, day_idx, activity_time) for week in tt_weeks
    ]
    start_time = occurrences[0]

    event = Event()
    event.add("summary", summary)
    event.add("description", description)
    event.add("location", f"{room_name} ({capacity})")
    event.add("dtstart", start_time)
    event.add("dtend", start_time + length)
    event.add("rrule", {
        "freq": "YEARLY",
        "count": len(tt_weeks),
        "byweekno": [ when.isocalendar()[1] for when in occurrences ],
        "byday": CAL_DAYS[day_idx]
    })
    return event

def parse_room_timetable(
    room_name : str,
    timetable : str,
    cals : Dict[str, Calendar]
):
    """Parses a room timetable page and adds its activities to cals

    Every activity cell (see is_activity) in a "grid-border-args" table
    becomes one recurring event. Columns are 30 minute slots counted
    from TT_BASETIME and a cell's colspan is its length in slots.

    Events are added to cals["CAP <capacity> <room_name>"]. The entry is
    indexed without a membership test, so cals has to create missing
    calendars on access (get_timetables_rooms passes a
    defaultdict(Calendar)).

    Activities whose weeks label yields no weeks are skipped."""
    soup = BeautifulSoup(timetable, "lxml")

    capacity = int(soup.body.find("span", { "class": "header-1-0-1" }).text)
    room_title = f"CAP {capacity:03} {room_name}"
    slot = timedelta(minutes=30)

    for ttable in soup.body.findChildren("table", recursive=False):
        if "grid-border-args" not in ttable.attrs.get("class", []):
            continue

        weeks = ""
        day_idx = 0 # Monday
        tt_rows = ttable.findChildren("tr", recursive=False)
        for (row_idx, tt_row) in enumerate(tt_rows):
            if row_idx == 1:
                weeks = tt_row.find("td", { "class": "row-label-two" }).text

            # rows without a day label keep the day of the row above
            day_label = tt_row.find("td", { "class": "row-label-one" })
            if day_label:
                day_idx = get_day_index(day_label.text)

            cell_pos = 0
            for cell in tt_row.findChildren("td", recursive=False):
                colspan = int(cell.attrs.get("colspan", "1"))

                if is_activity(cell):
                    tt_weeks = list(get_weeks(weeks))
                    # get_weeks yields nothing for a blank label, and an
                    # event cannot be dated without a first week
                    if tt_weeks:
                        cals[room_title].add_component(_make_room_event(
                            cell,
                            room_name,
                            capacity,
                            day_idx,
                            TT_BASETIME + cell_pos * slot,
                            colspan * slot,
                            tt_weeks
                        ))

                # row labels sit before the first time slot
                if not is_row_label(cell):
                    cell_pos += colspan
def get_rooms(
    cookies : requests.cookies.RequestsCookieJar,
    init_form_data : Dict[str, str],
    room_set : str,
) -> Set[Room]:
    """Lists the rooms the timetabling system offers for a room set

    Works on a copy of init_form_data, which is left untouched."""
    form_data = dict(init_form_data)
    set_page = get_set_page(cookies, "location", room_set, form_data)
    room_options = set_page.find(id="dlObject").find_all("option")
    return {
        Room(room_option.text, room_option.get("value"))
        for room_option in room_options
    }
def get_room_timetable(
    cookies : requests.cookies.RequestsCookieJar,
    init_form_data : Dict[str, str],
    room_set : str,
    room : Room,
    cals : Dict[str, Calendar]
):
    """Fetches the timetable of one room and parses it into cals

    Works on a copy of init_form_data. Nothing is requested if the weeks
    selector has no option whose label contains "all"."""
    form_data = dict(init_form_data)
    soup = get_set_page(cookies, "location", room_set, form_data)

    # value of the first weeks option labelled as covering all weeks
    all_weeks = next(
        (
            option.get("value")
            for option in soup.find(id="lbWeeks").find_all("option")
            if "all" in option.text.lower()
        ),
        None
    )
    if all_weeks is None:
        return

    form_data.update({
        "dlObject": room.tt_name_id,
        "lbWeeks": all_weeks,
        "lbDays": "1-5",
        "dlPeriod": "1-28",
        "RadioType": "individual;swsurl;swsurl",
        "bGetTimetable": "View Timetable",
    })

    timetable = make_request(form_data, cookies)
    parse_room_timetable(room.room_name, timetable, cals)
def get_timetables_rooms(password : str):
    """Logs in and builds a calendar for every room in ROOM_SETS

    Returns a defaultdict of Calendars, keyed and filled in by
    parse_room_timetable."""
    session_form = {}
    initialise_form_data(session_form)
    cookies = login(session_form, password)

    # need to refresh form data with a blank request
    make_request(session_form, cookies)
    base_form = dict(session_form)

    room_cals = defaultdict(Calendar)
    for set_name in ROOM_SETS:
        for room in get_rooms(cookies, base_form, set_name):
            print("Getting", room.room_name)
            get_room_timetable(
                cookies, base_form, set_name, room, room_cals
            )

    return room_cals
def save_cals_rooms(cals : Dict[str, "Calendar"]):
    """Writes each room calendar to room-<name>.ics in the working directory

    Spaces in the room name become hyphens, as do path separators, so a
    name containing one still gives a plain file name."""
    # built outside the f-string: reusing the enclosing quote character
    # inside a replacement field is a syntax error before Python 3.12
    unsafe = str.maketrans({ " ": "-", "/": "-", "\\": "-" })
    for (room, cal) in cals.items():
        filename = f"room-{room.translate(unsafe)}.ics"
        with open(filename, 'wb') as icsfile:
            icsfile.write(cal.to_ical())
# Pass "rooms" on the command line to scrape room timetables, otherwise
# the per-year module timetables are scraped. argv[0] is the script
# path, so only the actual arguments are checked.
rooms_mode = "rooms" in sys.argv[1:]

password = getpass(f"Password for {USERNAME}: ")

if rooms_mode:
    cals = get_timetables_rooms(password)
    save_cals_rooms(cals)
else:
    cals = get_timetables_years(password)
    save_cals_years(cals)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment