diff --git a/campus/fhda/fhda_campus.py b/campus/fhda/fhda_campus.py index 459a4df..724471e 100644 --- a/campus/fhda/fhda_campus.py +++ b/campus/fhda/fhda_campus.py @@ -37,20 +37,21 @@ def load_db(self, campus, year, quarter): name = f'{year}{quarter_num}{self.CAMPUS_TO_NUM[campus]}' try: - db = TinyDB(join(DB_DIR, f'{name}_database.json'), access_mode='r') + db = TinyDB(join(DB_DIR, f'merge_{name}_database.json'), access_mode='r') except FileNotFoundError: - # raise FileNotFoundError try: - db = TinyDB(join(DB_DIR, f'new_{name}_database.json'), access_mode='r') + db = TinyDB(join(DB_DIR, f'sched_{name}_database.json'), access_mode='r') except FileNotFoundError: - # raise FileNotFoundError try: - db = TinyDB(join(DB_DIR, f'sched_{name}_database.json'), access_mode='r') + db = TinyDB(join(DB_DIR, f'new_{name}_database.json'), access_mode='r') except FileNotFoundError: raise FileNotFoundError return db + def load_multi_db(self, campus): + return TinyDB(join(DB_DIR, 'multi_database.json'), access_mode='r') + def list_dbs(self, campus): with open(join(DB_DIR, 'metadata.json'), 'r') as file: metadata = json.loads(file.read()) diff --git a/campus/fhda/fhda_scrape.py b/campus/fhda/fhda_scrape.py index 0832ece..0a68e89 100644 --- a/campus/fhda/fhda_scrape.py +++ b/campus/fhda/fhda_scrape.py @@ -5,15 +5,19 @@ from collections import defaultdict from titlecase import titlecase +from tinydb import TinyDB, where +from tinydb.storages import JSONStorage +from tinydb.middlewares import CachingMiddleware -from logger import log_err, log_warn +from logger import log, log_info, log_err, log_warn from data.utils import list_dbs from scraper.ssb_base import BaseHooks from scraper.ssb_auth_schedule import AdvancedScraper from scraper.ssb_public_schedule import ScheduleScraper +from scraper.postprocess import postprocess_dbs from .fhda_login import login -from .fhda_settings import SSB_URL, DB_DIR, CACHE_DIR +from .fhda_settings import SSB_URL, DB_DIR, CACHE_DIR, NUM_TO_QUARTER, NUM_TO_CAMPUS from .fhda_utils import clean_course_name_str ENABLE_ADVANCED = True @@ -39,6 +43,29 @@ def clean_dept_name(name: str): return re.sub(r'^(.*\w)-[FHDA]{2}$', r'\1', name) +def get_term_info(term): + year = int(term[0:4]) + quarter_num = int(term[4]) + quarter = NUM_TO_QUARTER[quarter_num] + term_campus = NUM_TO_CAMPUS[int(term[5])] + + if quarter_num < 3: + # If the quarter is summer or fall, then the year should be incremented + # Ex. Fall 2020 => 20212X + year -= 1 + + return year, quarter, term_campus + + +def load_db(term, tag, campus, readonly=False): + db_path = join(DB_DIR, f'{tag}_{term}_database.json') + + if readonly: + return TinyDB(db_path, access_mode='r', storage=CachingMiddleware(JSONStorage)) + else: + return TinyDB(db_path) + + class FHDAScraperHooks(BaseHooks): @staticmethod def transform_depts(depts): @@ -105,10 +132,10 @@ def transform_class(class_data): hooks=FHDAScraperHooks, login=login, - max_terms=4, + max_terms=8, # use_cache=False, # start_term='202042', - trace=True, + # trace=True, ) scraper.run() @@ -127,10 +154,10 @@ def transform_class(class_data): hooks=FHDAScraperHooks, # login=login, - max_terms=4, + max_terms=8, # use_cache=False, - # start_term='202042', - trace=True, + # start_term='202111', + # trace=True, ) scraper.run() @@ -151,3 +178,6 @@ def transform_class(class_data): with open(join(DB_DIR, 'metadata.json'), 'w') as outfile: json.dump({'tags': dict(tagdbs), 'terms': dict(termdbs)}, outfile) + + db = TinyDB(join(DB_DIR, 'multi_database.json')) + postprocess_dbs(db, termdbs, get_term_info=get_term_info, load_db=load_db) diff --git a/campus/fhda/fhda_settings.py b/campus/fhda/fhda_settings.py index 363b1fa..ea1f547 100644 --- a/campus/fhda/fhda_settings.py +++ b/campus/fhda/fhda_settings.py @@ -16,6 +16,21 @@ # Available Campuses - Foothill, De Anza, and test CAMPUS_LIST = {'fh': '202121', 'da': '202122', 'test': 'test'} +# Cool stuff +QUARTER_TO_NUM = { + 'summer': 1, + 'fall': 2, + 'winter': 3, + 'spring': 4 +} +NUM_TO_QUARTER = {v: k for k, v in QUARTER_TO_NUM.items()} + +CAMPUS_TO_NUM = { + 'fh': 1, + 'da': 2 +} +NUM_TO_CAMPUS = {v: k for k, v in CAMPUS_TO_NUM.items()} + ''' Course Type Flags - Foothill College diff --git a/campus/wvm/wvm_campus.py b/campus/wvm/wvm_campus.py index 07534e1..050b840 100644 --- a/campus/wvm/wvm_campus.py +++ b/campus/wvm/wvm_campus.py @@ -38,6 +38,9 @@ def load_db(self, campus, year, quarter): return db + def load_multi_db(self, campus): + return TinyDB(join(DB_DIR, f'multi_{self.CAMPUS_TO_PREFIX[campus]}_database.json'), access_mode='r') + def list_dbs(self, campus): with open(join(DB_DIR, 'metadata.json'), 'r') as file: metadata = json.loads(file.read()) diff --git a/campus/wvm/wvm_scrape.py b/campus/wvm/wvm_scrape.py index 2e673e8..acb2c4c 100644 --- a/campus/wvm/wvm_scrape.py +++ b/campus/wvm/wvm_scrape.py @@ -2,14 +2,20 @@ import json from os.path import join from copy import deepcopy +from collections import defaultdict + +from tinydb import TinyDB, where +from tinydb.storages import JSONStorage +from tinydb.middlewares import CachingMiddleware from logger import log_err from data.utils import list_dbs from scraper.ssb_base import BaseHooks from scraper.ssb_auth_schedule import AdvancedScraper from scraper.ssb_public_schedule import ScheduleScraper +from scraper.postprocess import postprocess_dbs -from .wvm_settings import SSB_URL, DB_DIR, CACHE_DIR +from .wvm_settings import SSB_URL, DB_DIR, CACHE_DIR, NUM_TO_QUARTER, PREFIX_TO_CAMPUS def clean_dept_name(name: str): @@ -20,6 +26,23 @@ def clean_dept_name(name: str): return re.sub(r'^(.*\w) ?- ?[WVMC]{2,3}$', r'\1', name) +def get_term_info(campus): + def get_info(term): + year = int(term[0:4]) + quarter = NUM_TO_QUARTER[int(term[4])] + return year, quarter, PREFIX_TO_CAMPUS[campus] + return get_info + + +def load_db(term, tag, campus, readonly=False): + db_path = join(DB_DIR, f'{tag}_{campus}_{term}_database.json') + + if readonly: + return TinyDB(db_path, access_mode='r', storage=CachingMiddleware(JSONStorage)) + else: + return TinyDB(db_path) + + class WVMScraperHooks(BaseHooks): @staticmethod def transform_depts(depts): @@ -57,7 +80,7 @@ def transform_class(class_data): # max_terms=4, # use_cache=False, # start_term='201231', - trace=True, + # trace=True, ) scraper.run() @@ -75,3 +98,12 @@ def transform_class(class_data): with open(join(DB_DIR, 'metadata.json'), 'w') as outfile: json.dump({'terms': termdbs}, outfile) + + ddd = defaultdict(lambda: defaultdict(list)) + + for info in termdbs: + ddd[info['campus']][info['code']] = ['sched'] + + for campus, term_dbs in ddd.items(): + db = TinyDB(join(DB_DIR, f'multi_{campus}_database.json')) + postprocess_dbs(db, term_dbs, campus=campus, get_term_info=get_term_info(campus), load_db=load_db) diff --git a/campus/wvm/wvm_settings.py b/campus/wvm/wvm_settings.py index 130133b..447ba2b 100644 --- a/campus/wvm/wvm_settings.py +++ b/campus/wvm/wvm_settings.py @@ -5,3 +5,16 @@ SSB_URL = 'https://ssb-prod.ec.wvm.edu/PROD/' DB_DIR = join(ROOT_DB_DIR, 'wvm') CACHE_DIR = join(ROOT_CACHE_DIR, 'wvm') + +QUARTER_TO_NUM = { + 'winter': 1, + 'spring': 3, + 'summer': 5, + 'fall': 7, +} +NUM_TO_QUARTER = {v: k for k, v in QUARTER_TO_NUM.items()} +CAMPUS_TO_PREFIX = { + 'wv': 'wvc', + 'mc': 'mc', +} +PREFIX_TO_CAMPUS = {v: k for k, v in CAMPUS_TO_PREFIX.items()} diff --git a/data/access.py b/data/access.py index be533c0..789c2bd 100644 --- a/data/access.py +++ b/data/access.py @@ -10,6 +10,16 @@ def load(self, campus, year, quarter): return ALL_CAMPUS[campus].load_db(campus, year, quarter) + def load_multi_db(self, campus): + self.validate_campus(campus) + + return ALL_CAMPUS[campus].load_multi_db(campus) + + def one_instructor(self, db, instructor): + return db.table('instructors').get( + where('pretty_id') == instructor + ) + def campus_info(self, campus): self.validate_campus(campus) diff --git a/data/models.py b/data/models.py index 92c309d..b4dd987 100644 --- a/data/models.py +++ b/data/models.py @@ -50,6 +50,8 @@ class ClassDataSchema(Schema): # Number of open seats # seats = fields.Int(required=True, min=0) seats = fields.Int(min=0) + # Number of filled / taken seats + seats_taken = fields.Int(min=0) # Number of open waitlist seats # wait_seats = fields.Int(required=True, min=0) wait_seats = fields.Int(min=0) @@ -98,7 +100,9 @@ class ClassTimeSchema(Schema): # time = fields.Str(required=True) start_time = fields.Str(required=True) end_time = fields.Str(required=True) - instructor = fields.Str(required=True) + # instructor = fields.Str(required=True) + # instructor = fields.List(fields.Str(), required=True) + instructor = fields.List(fields.Raw(), required=True) location = fields.Str(required=True) room = fields.Str() campus = fields.Str() diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index a62577a..f57ce19 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -5,6 +5,7 @@ import { Router, route } from 'preact-router' import CollegePage from './pages/CollegePage' import DeptPage from './pages/DeptPage' import CoursePage from './pages/CoursePage' +import InstructorPage from './pages/InstructorPage' import { PageNotFound, CampusNotFound, ErrorPage } from './components/NotFound' import { campus, PATH_PREFIX } from './data' import { TermYear, CampusInfo, useRootApi } from './state' @@ -80,6 +81,7 @@ export default function App() { + ) diff --git a/frontend/src/components/ClassTable.jsx b/frontend/src/components/ClassTable.jsx new file mode 100644 index 0000000..607af47 --- /dev/null +++ b/frontend/src/components/ClassTable.jsx @@ -0,0 +1,90 @@ +import { h, Fragment } from 'preact' +import Match from 'preact-router/match' + +const Link = Match.Link + +const replaceTBA = (text) => ( + text === 'TBA' + ? ( + (none) + ) + : text +) + +function ClassTimeCols({ time, campusId }) { + const timeString = time.start_time == 'TBA' + ? replaceTBA('TBA') + : `${time.start_time || '?'} - ${time.end_time || '?'}` + + // const instructors = (time.instructor || []) + // .map( + // ({ full_name, display_name, email }) => display_name || full_name + // ) + // .join(', ') + + const instructors = (time.instructor || []) + .map(({ full_name, display_name, email, pretty_id }, index, arr) => { + let name = display_name || full_name + + if (index < arr.length - 1) { + name += ', ' + } + + return pretty_id + ? {name} + : {name} + }) + .flat() + + return ( + <> + {time.type || '?'} + {instructors || '?'} + {replaceTBA(time.days || '?')} + {timeString || '?'} + {time.location || '?'} + + ) +} + +export default function ClassesTable({ campusId, headers, classes, getClassColumns }) { + if (!classes) return <> + + const tableRowEls = [] + + for (const section of classes) { + const numRows = section.times.length || 1 + const tableCols = getClassColumns(section) + + tableRowEls.push( + + {tableCols.map((name) => {name})} + + + ) + + for (const time of section.times.slice(1)) { + if (!time) continue + tableRowEls.push( + + + + ) + } + } + + return ( +
+ + + + {headers.map((name) => )} + + + + {tableRowEls} + +
{name}
+
+ ) +} diff --git a/frontend/src/pages/CoursePage.jsx b/frontend/src/pages/CoursePage.jsx index 7a80ecf..9375333 100644 --- a/frontend/src/pages/CoursePage.jsx +++ b/frontend/src/pages/CoursePage.jsx @@ -1,27 +1,13 @@ import { h, Fragment } from 'preact' -import { useState, useEffect } from 'preact/hooks' import { campus, PATH_PREFIX } from '../data' import { useApi } from '../state' +import { formatDate } from '../utils' import BreadCrumbs from '../components/BreadCrumbs' -import { CampusNotFound, DeptNotFound, CourseNotFound } from '../components/NotFound' +import ClassesTable from '../components/ClassTable' +import { CampusNotFound, CourseNotFound } from '../components/NotFound' -const opt = { year: 'numeric', month: 'short', day: 'numeric' } -const formatDate = (str) => new Date(Date.parse(str)).toLocaleDateString('en-US', opt) -const replaceTBA = (text) => text === 'TBA' ? (none) : text - -const displayTimes = (time) => { - const time_string = time.start_time == 'TBA' ? replaceTBA('TBA') : `${time.start_time} - ${time.end_time}` - return ( - <> - {time.instructor} - {replaceTBA(time.days)} - {time_string} - {time.location} - - ) - // ${time.room} -} +const dateFormatOpts = { year: 'numeric', month: 'short', day: 'numeric' } export default function CoursePage({ college, dept, course }) { const colleged = campus.find((cmp) => cmp.id === college) @@ -29,47 +15,26 @@ export default function CoursePage({ college, dept, course }) { if (!colleged) return const [classes, error] = useApi(`/${college}/depts/${dept}/courses/${course}/classes`) - const row_els = [] - const first = (classes && classes[0]) || {} - const hasSeatInfo = classes && classes[0] ? (classes[0].status && classes[0].seats != undefined) : true - const headers = ['CRN', 'Start', 'End', ...(hasSeatInfo ? ['Status', 'Seats', 'Waitlist'] : []), 'Professor', 'Days', 'Time', 'Location'] - // Room - - if (classes) { - for (const section of classes) { - const start = formatDate(section.start) - const end = formatDate(section.end) - - const rows = section.times.length - const table_rows = [ - // section.CRN, - section.CRN.toString().padStart(5, '0'), - start, - end, - ...(hasSeatInfo ? [ - section.status, - section.seats, - section.wait_cap ? `${section.wait_seats}/${section.wait_cap}` : section.wait_seats - ] : []) - ] - - row_els.push( - - {table_rows.map((name) => {name})} - {displayTimes(section.times[0])} - - ) - - for (const time of section.times.slice(1)) { - row_els.push( - - {displayTimes(time)} - - ) - } - } - } + const first = classes && classes[0] + const hasSeatInfo = first ? (first.status && first.seats != undefined) : false + const headers = [ + 'CRN', + 'Start', + 'End', + ...( + hasSeatInfo ? [ + 'Status', + 'Seats', + 'Waitlist' + ] : [] + ), + 'Type', + 'Professor', + 'Days', + 'Time', + 'Location', + ] const crumbs = [ { url: '/', name: 'Home' }, { url: `${PATH_PREFIX}/${college}`, name: colleged.name }, @@ -84,17 +49,29 @@ export default function CoursePage({ college, dept, course }) { } else { content = ( <> -

{first.title}  ·  {first.units} units

-
- - - - {headers.map((name) => )} - - - {row_els} -
{name}
-
+

+ {(first && first.title) || ''}  ·  {first ? first.units : 'X'} units +

+ { + const start = formatDate(section.start, dateFormatOpts) + const end = formatDate(section.end, dateFormatOpts) + + return [ + section.CRN.toString().padStart(5, '0'), + start, + end, + ...(hasSeatInfo ? [ + section.status, + section.seats, + section.wait_cap ? `${section.wait_seats}/${section.wait_cap}` : section.wait_seats + ] : []) + ] + }} + /> ) } diff --git a/frontend/src/pages/DeptPage.jsx b/frontend/src/pages/DeptPage.jsx index 5303213..c2bc923 100644 --- a/frontend/src/pages/DeptPage.jsx +++ b/frontend/src/pages/DeptPage.jsx @@ -4,29 +4,14 @@ import { route } from 'preact-router' import matchSorter from 'match-sorter' import { campus, PATH_PREFIX } from '../data' -import { setIntersection } from '../utils' +import { setIntersection, formatDate } from '../utils' import { useApi } from '../state' import { CampusNotFound, DeptNotFound } from '../components/NotFound' import Header from '../components/Header' +import ClassesTable from '../components/ClassTable' import BreadCrumbs from '../components/BreadCrumbs' -// const opt = { year: 'numeric', month: 'short', day: 'numeric' } -const opt = { month: 'short', day: 'numeric' } -const formatDate = (str) => new Date(Date.parse(str)).toLocaleDateString('en-US', opt) -const replaceTBA = (text) => text === 'TBA' ? (none) : text - -const displayTimes = (time) => { - const time_string = time.start_time == 'TBA' ? replaceTBA('TBA') : `${time.start_time} - ${time.end_time}` - return ( - <> - {time.instructor} - {replaceTBA(time.days)} - {time_string} - {time.location} - - ) - // ${time.room} -} +const dateFormatOpts = { month: 'short', day: 'numeric' } // function DeptCard({ id, name, count, subinfo, setDept }) { function DeptCard({ id, name, dept, course, title, count, subinfo, setDept }) { @@ -58,9 +43,16 @@ export default function DeptPage({ college, dept, setCourse }) { useEffect(() => { if (courses && classes) { + const getInstructors = item => item.times + .map(time => ( + (time.instructor || []).map( + ({ full_name, display_name }) => display_name || full_name + )) + ) + .join(' ') const filteredClasses = matchSorter(classes, query, { keys: [ - {minRanking: matchSorter.rankings.MATCHES, key: item => item.times.map(time => time.instructor).join(',')}, + {minRanking: matchSorter.rankings.MATCHES, key: getInstructors }, {threshold: matchSorter.rankings.EQUAL, key: 'course'}, {threshold: matchSorter.rankings.CONTAINS, key: 'title'}, item => item.dept + ' ' + item.course, @@ -102,46 +94,25 @@ export default function DeptPage({ college, dept, setCourse }) { // const view = 'card-view' const hasSeatInfo = classes && classes[0] ? (classes[0].status && classes[0].seats != undefined) : true - const headers = ['CRN', 'Course', 'Title', 'Dates', ...(hasSeatInfo ? ['Status', 'Seats', 'Waitlist'] : []), 'Professor', 'Days', 'Time', 'Location'] - const row_els = [] - const postFilterClasses = (query && filteredClasses) || classes - if (postFilterClasses && postFilterClasses.length) { - for (const section of postFilterClasses) { - const start = formatDate(section.start) - const end = formatDate(section.end) - - const rows = section.times.length - const table_rows = [ - section.CRN.toString().padStart(5, '0'), - `${section.dept} ${section.course}`, - `${section.title}`, - `${start} - ${end}`, - // end, - ...(hasSeatInfo ? [ - section.status, - section.seats, - section.wait_cap ? `${section.wait_seats}/${section.wait_cap}` : section.wait_seats - ] : []) - ] - - row_els.push( - - {table_rows.map((name) => {name})} - {displayTimes(section.times[0])} - - ) - - for (const time of section.times.slice(1)) { - row_els.push( - - {displayTimes(time)} - - ) - } - } - } - + const headers = [ + 'CRN', + 'Course', + 'Title', + 'Dates', + ...( + hasSeatInfo ? [ + 'Status', + 'Seats', + 'Waitlist' + ] : [] + ), + 'Type', + 'Professor', + 'Days', + 'Time', + 'Location', + ] const crumbs = [ { url: '/', name: 'Home' }, { url: `${PATH_PREFIX}/${college}${window.location.search}`, name: colleged.name }, @@ -158,18 +129,27 @@ export default function DeptPage({ college, dept, setCourse }) {

Courses

{cards}

All Classes

-
- - - - {headers.map((name) => )} - - - - {row_els} - -
{name}
-
+ { + const start = formatDate(section.start, dateFormatOpts) + const end = formatDate(section.end, dateFormatOpts) + + return [ + section.CRN.toString().padStart(5, '0'), + `${section.dept} ${section.course}`, + `${section.title}`, + `${start} - ${end}`, + ...(hasSeatInfo ? [ + section.status, + section.seats, + section.wait_cap ? `${section.wait_seats}/${section.wait_cap}` : section.wait_seats + ] : []) + ] + }} + /> ) } diff --git a/frontend/src/pages/InstructorPage.jsx b/frontend/src/pages/InstructorPage.jsx new file mode 100644 index 0000000..db8a8f1 --- /dev/null +++ b/frontend/src/pages/InstructorPage.jsx @@ -0,0 +1,74 @@ +import { h, Fragment } from 'preact' +import Match from 'preact-router' + +import { campus, PATH_PREFIX } from '../data' +import { useApi } from '../state' +import { CampusNotFound } from '../components/NotFound' +import BreadCrumbs from '../components/BreadCrumbs' + +const Link = Match.Link +const firstCharUpper = (str) => str.charAt(0).toUpperCase() + str.substring(1) + +export default function InstructorPage({ college, id }) { + const colleged = campus.find((cmp) => cmp.id === college) + + if (!colleged) return + + const [instructor, error] = useApi(`/${college}/instructors/${id}`) + const crumbs = [ + { url: '/', name: 'Home' }, + { url: `${PATH_PREFIX}/${college}${window.location.search}`, name: colleged.name }, + { url: `${PATH_PREFIX}/${college}/instructor/${id}${window.location.search}`, name: id }, + ] + const groupedClasses = {} + + if (instructor) { + instructor.classes.sort((a, b) => b.term_code - a.term_code).map(({ term, year, campus: campusId, dept, course, seats_taken }) => { + const title = firstCharUpper(`${term} ${year}`) + + if (!groupedClasses[title]) groupedClasses[title] = [] + + let courseName = `${dept} ${course}` + // let text = `Taught ${dept} ${course}` + let text = ` at ${campus.find(cmp => cmp.id == campusId).name}` + + if (seats_taken != undefined && seats_taken != null) text += ` to ${seats_taken} students` + + groupedClasses[title].push( +
+ Taught + {courseName} + {text} +
+ ) + }) + } + + return ( + error == 'NOT_FOUND' ? ( + + ) : ( +
+ +
+

{instructor ? instructor.display_name || instructor.full_name : id} @ {colleged.name}

+ {/*
*/} + {/*
*/} +
+ {instructor && instructor.email && ( + + )} +
+ {Object.entries(groupedClasses).map(([title, els]) => ( + <> +

{title}

+ {els} + + ))} +
+ {/*

{email} @ {colleged.name}

+
{cards}
*/} +
+ ) + ) +} diff --git a/frontend/src/utils.js b/frontend/src/utils.js index 4df63c5..eed9712 100644 --- a/frontend/src/utils.js +++ b/frontend/src/utils.js @@ -1,5 +1,5 @@ -function setIntersection(setA, setB) { +export const formatDate = (str, opt) => new Date(Date.parse(str)).toLocaleDateString('en-US', opt) + +export function setIntersection(setA, setB) { return new Set([...setA].filter(x => setB.has(x))) } - -export { setIntersection } diff --git a/merge.py b/merge.py deleted file mode 100644 index f033d56..0000000 --- a/merge.py +++ /dev/null @@ -1,75 +0,0 @@ -from tinydb import TinyDB - -from logger import log_warn, log_err - - -def merge_dicts(cl1, cl2, allowed): - target = {} - CRN = cl1.get('CRN') or 'time' - - def loop_on_keys(keys): - for key in keys: - if key == 'times': - target['times'] = [] - - times1 = cl1.pop(key) - times2 = cl2.pop(key) - - if len(times1) != len(times2): - log_err(f'"{key}" is different for class {CRN}: "{len(times1)}" vs. "{len(times2)}"') - continue - - for idx in range(len(times1)): - time1 = times1[idx].copy() - time2 = times2[idx].copy() - - target['times'].append(merge_dicts(time1, time2, allowed=allowed)) - - continue - - val1 = cl1.pop(key, None) - val2 = cl2.pop(key, None) - - if val1 != None and val2 != None: - if val1 != val2 and key not in allowed: - log_warn(f'"{key}" is different for class {CRN}: "{val1}" vs. "{val2}"') - target[key] = val1 - else: - target[key] = val1 if val1 != None else val2 - - loop_on_keys(list(cl1.keys())) - loop_on_keys(list(cl2.keys())) - - return target - - -def merge_dbs(final: TinyDB, first: TinyDB, second: TinyDB, allowed): - classes1 = {doc['CRN']: doc for doc in first.table('classes').all()} - classes2 = {doc['CRN']: doc for doc in second.table('classes').all()} - - classes = [] - - for CRN in classes1.keys(): - cl1 = classes1[CRN].copy() - cl2 = classes2.get(CRN) - - if cl2: - classes.append(merge_dicts(cl1, cl2.copy(), allowed=allowed)) - else: - log_err(f'Class {CRN} was only found in one DB!') - classes.append(cl1) - - - final.drop_tables() - final.table('classes').insert_multiple(classes) - - -ALLOWED_ONE = ['title', 'instructor', 'seats', 'status', 'wait_seats', 'wait_cap'] - -if __name__ == '__main__': - target = TinyDB('db/fhda/merge_202121_database.json') - db1 = TinyDB('db/fhda/new_202121_database.json') - # db2 = TinyDB('db/fhda/sched_202121_database.json') - db2 = TinyDB('db/fhda/202121_database.json') - - merge_dbs(target, db1, db2, allowed=ALLOWED_ONE) diff --git a/scraper/merge.py b/scraper/merge.py new file mode 100644 index 0000000..04205e3 --- /dev/null +++ b/scraper/merge.py @@ -0,0 +1,94 @@ +from tinydb import TinyDB + +from logger import log_warn, log_err + +CONFIGS = { + ('auth_sched', 'public_sched'): { + 'allowed': ['location', 'instructor'], + 'preference': { + 'location': 1, + 'instructor': 1 + } + }, + ('auth_sched', 'fhda_term'): { + 'allowed': ['title', 'instructor', 'seats', 'status', 'wait_seats', 'wait_cap'], + 'preference': {} + } +} + + +def merge_dicts(cl1, cl2, allowed, preference): + target = {} + CRN = cl1.get('CRN') or 'time' + + def loop_on_keys(keys): + for key in keys: + if key == 'times': + target['times'] = [] + + times1 = cl1.pop(key) + times2 = cl2.pop(key) + + if len(times1) != len(times2): + log_err(f'"{key}" is different for class {CRN}: "{len(times1)}" vs. "{len(times2)}"') + continue + + for idx in range(len(times1)): + time1 = times1[idx].copy() + time2 = times2[idx].copy() + + target['times'].append(merge_dicts(time1, time2, allowed=allowed, preference=preference)) + + continue + + val1 = cl1.pop(key, None) + val2 = cl2.pop(key, None) + + if val1 != None and val2 != None: + if val1 != val2 and key not in allowed: + log_warn(f'"{key}" is different for class {CRN}: "{val1}" vs. "{val2}"') + + if preference.get(key): + target[key] = [val1, val2][preference[key]] + else: + if val1 != val2: + log_warn(f'Preference not given for "{key}"') + target[key] = val1 + + else: + target[key] = val1 if val1 != None else val2 + + loop_on_keys(list(cl1.keys())) + loop_on_keys(list(cl2.keys())) + + return target + + +def merge_dbs(final: TinyDB, first: TinyDB, second: TinyDB, allowed, preference): + classes1 = {doc['CRN']: doc for doc in first.table('classes').all()} + classes2 = {doc['CRN']: doc for doc in second.table('classes').all()} + + classes = [] + + for CRN in classes1.keys(): + cl1 = classes1[CRN].copy() + cl2 = classes2.get(CRN) + + if cl2: + classes.append(merge_dicts(cl1, cl2.copy(), allowed=allowed, preference=preference)) + else: + log_err(f'Class {CRN} was only found in one DB!') + classes.append(cl1) + + final.drop_tables() + final.table('departments').insert_multiple(first.table('departments').all()) + final.table('courses').insert_multiple(first.table('courses').all()) + final.table('classes').insert_multiple(classes) + + +def merge(config_name: tuple, target: TinyDB, first: TinyDB, second: TinyDB): + config = CONFIGS.get(config_name) + if config: + merge_dbs(target, first, second, config['allowed'], config['preference']) + else: + raise NotImplementedError diff --git a/scraper/postprocess.py b/scraper/postprocess.py new file mode 100644 index 0000000..bb19422 --- /dev/null +++ b/scraper/postprocess.py @@ -0,0 +1,73 @@ +from collections import defaultdict + +from tinydb import TinyDB +from tinydb.storages import JSONStorage +from tinydb.middlewares import CachingMiddleware + +from logger import log, log_info, log_err, log_warn +from scraper.merge import merge + +def postprocess_dbs(final_db: TinyDB, term_dbs, get_term_info, load_db, campus=None): + all_instr = {} + + for term, tags in sorted(term_dbs.items(), key=lambda item: item[0], reverse=True): + all_instr_crns = defaultdict(set) + + if 'sched' in tags: + db = load_db(term, 'sched', campus=campus, readonly=True) + + if 'new' in tags: + otherdb = load_db(term, 'new', campus=campus, readonly=True) + + # Merge DBs generated by the auth and public schedule scrapers + target = load_db(term, 'merge', campus=campus, readonly=False) + + log(term, 'magenta', 'Merging DBs...') + merge(('auth_sched', 'public_sched'), target, otherdb, db) + else: + otherdb = None + target = None + + classes = (target if target != None else db).table('classes').all() + + year, pretty_term, campus_id = get_term_info(term) + + for the_class in classes: + if not the_class.get('times'): + log_warn(f'Class with CRN "{the_class.get("CRN")}" in term "{term}" does not have times!') + continue + + for time in the_class['times']: + for instructor in time['instructor']: + if isinstance(instructor, str) : continue + + instructor_id = instructor.get('id') + instructor_pretty_id = instructor.get('pretty_id') + + if instructor_id: + if not all_instr.get(instructor_id): + all_instr[instructor_id] = {**instructor} + all_instr[instructor_id]['classes'] = [] + + if the_class['CRN'] in all_instr_crns[instructor_id]: + continue + + partial_class = { + 'term_code': term, + 'year': year, + 'term': pretty_term, + 'campus': campus_id, + 'CRN': the_class['CRN'], + 'dept': the_class['dept'], + 'course': the_class['course'], + 'title': the_class['title'], + } + + if the_class.get('seats_taken') != None: + partial_class['seats_taken'] = the_class['seats_taken'] + + all_instr[instructor_id]['classes'].append(partial_class) + all_instr_crns[instructor_id].add(the_class['CRN']) + + final_db.drop_table('instructors') + final_db.table('instructors').insert_multiple(all_instr.values()) diff --git a/scraper/ssb_auth_schedule.py b/scraper/ssb_auth_schedule.py index 7ee3032..907ec39 100644 --- a/scraper/ssb_auth_schedule.py +++ b/scraper/ssb_auth_schedule.py @@ -2,14 +2,17 @@ from bs4 import BeautifulSoup -from .ssb_base import BaseSSBScraper, SOUP_PARSER +from .ssb_base import BaseSSBScraper, BaseHooks, SOUP_PARSER -def parse_class_time(data): +def parse_class_time(data, hooks: BaseHooks): + instructors = data.get('Instructor') + instructors = [{'full_name': hooks.clean_instructor_name(name)} for name in instructors.split(',')] if instructors else [] + converted = { 'days': data.get('Days'), 'time': data.get('Time'), - 'instructor': data.get('Instructor'), + 'instructor': instructors, 'location': data.get('Location') or 'TBA', # 'room': data.get('Location'), # 'campus': data.get('Cmp'), @@ -44,6 +47,7 @@ def parse_class_data(data): converted['units'] = data.get('Cred') converted['seats'] = data.get('Rem') + converted['seats_taken'] = data.get('Act') converted['wait_seats'] = data.get('WL Rem') # converted['wait_cap'] = 0 @@ -149,7 +153,7 @@ def magic_clean(els): # if not is_first_row_for_class and data.get('Cmp') is None and last_class_time: # data['Cmp'] = last_class_time['campus'] - class_time_data = last_class_time = parse_class_time(data) + class_time_data = last_class_time = parse_class_time(data, self.hooks) if is_first_row_for_class: class_data = last_class = parse_class_data(data) diff --git a/scraper/ssb_base.py b/scraper/ssb_base.py index 0f1a874..86dd4ae 100644 --- a/scraper/ssb_base.py +++ b/scraper/ssb_base.py @@ -1,3 +1,4 @@ +import re from os import makedirs from os.path import join, exists from collections import defaultdict @@ -8,7 +9,7 @@ from tinydb import TinyDB from marshmallow import ValidationError as MarshValidationError -from logger import log, log_info, log_err, log_trace +from logger import log, log_info, log_warn, log_trace from data.models import classDataSchema, classTimeSchema SOUP_PARSER = 'lxml' @@ -42,6 +43,11 @@ def clean_units_str(units_str): else: return units_str + @staticmethod + def clean_instructor_name(name): + # Replace ', ' with '', '(P)' with '', ' ' (n spaces) with ' ' (one space) + return re.sub(r'\s+', ' ', re.sub(r'(?:, )|(?:\(\w?\))', '', name)).strip() + class BaseSSBScraper: PREFIX = '' @@ -186,7 +192,7 @@ def save_classes(self, db, depts, classes): course_classes.append(data['CRN']) if len(course_titles) > 1: - log_err(f'Multiple course titles for "{dept} {course}" {str(course_titles)}') + log_warn(f'Multiple course titles for "{dept} {course}" {str(course_titles)}') db_courses.append({ 'dept': dept, diff --git a/scraper/ssb_public_schedule.py b/scraper/ssb_public_schedule.py index 3ead649..60fac6d 100644 --- a/scraper/ssb_public_schedule.py +++ b/scraper/ssb_public_schedule.py @@ -1,17 +1,11 @@ from collections import defaultdict from datetime import datetime +from hashlib import sha256, sha224 from bs4 import BeautifulSoup from .ssb_base import BaseSSBScraper, SOUP_PARSER -# CACHE_DIR = join(DB_DIR, '.cache', 'scrape_schedule') -# CACHE_DIR = join(DB_DIR, '.cache', 'scrape_schedule', 'western_colorada') -# CACHE_DIR = join(DB_DIR, '.cache', 'scrape_schedule', 'tennessee_knoxville') -# CACHE_DIR = join(DB_DIR, '.cache', 'scrape_schedule', 'west_valley_mission') -# SOUP_PARSER = 'lxml' -# SOUP_PARSER = 'html5lib' - class ScheduleScraper(BaseSSBScraper): PREFIX = 'sched_' @@ -156,6 +150,50 @@ def parse_inner_table(self, table): data = dict(zip(table_headers, data_cols)) dates = data.get('Date Range') + instr_td = tds[table_headers.index('Instructors')] + instructors = [] + last_name = '' + + def add_partial_last(): + if last_name: + full_name = self.hooks.clean_instructor_name(last_name) + pretty_id = full_name.lower().replace(' ', '-') + instructors.append({ + 'id': sha224(pretty_id.encode()).hexdigest(), + 'pretty_id': pretty_id, + 'full_name': full_name + }) + + for node in instr_td.contents: + if isinstance(node, str): + if node.strip().startswith(','): + add_partial_last() + last_name = node + else: + last_name += node + else: + if node.name == 'a': + full_name = self.hooks.clean_instructor_name(last_name) + email = node.get('href').replace('mailto:', '').strip() + + instructors.append({ + 'id': sha224(email.encode()).hexdigest(), + 'pretty_id': full_name.lower().replace(' ', '-'), + 'full_name': full_name, + 'display_name': node.get('target').strip(), + 'email': email + }) + last_name = '' + + elif node.name == 'abbr': + last_name += node.get_text() + pass + + else: + print('idk what this is', node) + + add_partial_last() + if not dates or dates == 'TBA': start = 'TBA' end = 'TBA' @@ -173,8 +211,9 @@ def parse_inner_table(self, table): 'type': data.get('Type'), 'days': data.get('Days'), 'time': data.get('Time'), - 'instructor': data.get('Instructors'), + 'instructor': instructors, 'location': data.get('Where') or 'TBA', + # 'instructor': data.get('Instructors'), # 'room': data.get('Where').split(' ')[-1], # 'campus': campus, diff --git a/server.py b/server.py index cb95293..d1636e5 100644 --- a/server.py +++ b/server.py @@ -64,6 +64,33 @@ def api(campus, *args, **kwargs): return decorator +def campus_multi_term_api(path: str, methods=None): + def decorator(func): + @application.route(f'//{path}', methods=(methods or ['GET'])) + @wraps(func) + def api(campus, *args, **kwargs): + try: + try: + db = database.load_multi_db(campus) + ret = func(db, *args, **kwargs) + except FileNotFoundError: + raise ApiError( + 404, + 'Data for requested campus does not exist.' + ) + + if ret is None or (isinstance(ret, list) and len(ret) == 0): + raise ApiError(404, 'No results') + + except ApiError as e: + return jsonify({'error': e.message}), e.status + + return jsonify(ret), 200 + + return api + return decorator + + @application.route('/') def api_campus(campus): try: @@ -74,6 +101,11 @@ def api_campus(campus): return jsonify(ret), 200 +@campus_multi_term_api('instructors/') +def api_one_instructor(db, instructor): + return database.one_instructor(db, instructor) + + @campus_api('courses') def api_courses(db): return database.all_courses(db)