MarzooqN / degree-planner

0 stars 0 forks source link

Scrape requirements from degree audit #55

Open lukew3 opened 2 months ago

lukew3 commented 2 months ago

Working on a way to get course requirement data from the degree audit. Trying to use JS to scrape because it runs natively in the browser and is familiar. Here's a script that you can paste into your browser developer console when viewing a degree audit page (uncommenting the download line at the end will allow you to save the results as a JSON file on your computer).

/**
 * Trigger a browser download of `text` as a file named `filename`.
 *
 * A hidden anchor element is created whose href is a `data:` URI carrying the
 * URI-encoded text; the anchor is attached to the page, clicked
 * programmatically, and detached again.
 *
 * @param {string} filename - Value for the anchor's `download` attribute.
 * @param {string} text - Content embedded in the `data:` URI.
 */
function download(filename, text) {
    const link = document.createElement('a');
    const dataUri = `data:text/plain;charset=utf-8,${encodeURIComponent(text)}`;

    // Keep the temporary anchor invisible while it is attached to the page.
    link.style.display = 'none';
    link.setAttribute('download', filename);
    link.setAttribute('href', dataUri);

    const { body } = document;
    body.appendChild(link);
    link.click();
    body.removeChild(link);
}

/**
 * Scrape the sub-requirements from the degree audit page loaded in the browser.
 *
 * Each `.subreqBody` element on the page yields one object:
 *   - `title`: text of its `.subreqTitle`, with `*` characters removed;
 *   - `needs`: text of its `.subreqNeeds`;
 *   - `list`:  one "<department> <number>" string per child node of its
 *              `.fromcourselist` that carries both attributes.
 *
 * In `title` and `needs`, every run of whitespace (newlines, tabs, repeated
 * spaces) is collapsed to a single space and the ends are trimmed, so
 * multi-line labels come out as one clean line. A missing element produces
 * '' (or [] for `list`).
 *
 * Known limitation: `list` is flat. If the page groups the options into sets
 * (e.g. "choose 1 course from each set"), that grouping is not preserved.
 *
 * @returns {Array<{title: string, needs: string, list: string[]}>}
 */
function scrape() {
    // The audit markup pads labels with newlines and long tab runs; squash
    // them so each label is a single readable line.
    const cleanText = (text) => text.replace(/\s+/g, ' ').trim();

    return Array.from(document.querySelectorAll('.subreqBody')).map((subreq) => {
        const rawTitle = subreq.querySelector('.subreqTitle')?.textContent ?? '';
        const rawNeeds = subreq.querySelector('.subreqNeeds')?.textContent ?? '';
        const courseNodes = subreq.querySelector('.fromcourselist')?.childNodes ?? [];

        const list = Array.from(courseNodes).flatMap((node) => {
            // childNodes also contains text/comment nodes, which have no attributes.
            if (typeof node.getAttribute !== 'function') {
                return [];
            }
            const department = node.getAttribute('department');
            const number = node.getAttribute('number');
            // Skip elements that are not course entries (either attribute missing).
            if (department == null || number == null) {
                return [];
            }
            return [`${department.trim()} ${number.trim()}`];
        });

        return {
            title: cleanText(rawTitle.replaceAll('*', '')),
            needs: cleanText(rawNeeds),
            list,
        };
    });
}

// Run the scraper against the current page and print the resulting array to the console.
const results = scrape();
console.log(results);
// Uncomment the line below to also save the results as degreeAudit.json (pretty-printed, 4-space indent).
// download('degreeAudit.json', JSON.stringify(results, null, 4))

The results aren't exactly what we need, but I'm not sure how much better we can get:

[
    {
        "title": "A MINIMUM 2.0 GPA IS REQUIRED IN ALL UNIVERSITY COURSES",
        "needs": "",
        "list": []
    },
    {
        "title": "AT LEAST 30 O.S.U. HOURS REQUIRED FOR GRADUATION",
        "needs": "",
        "list": []
    },
    {
        "title": "AT LEAST 106 HOURS WHICH ARE NOT TAKEN PASS/NONPASS",
        "needs": "",
        "list": []
    },
    {
        "title": "OTHER COURSES COUNTING TOWARD GRADUATION",
        "needs": "",
        "list": []
    },
    {
        "title": "TERM GRADE POINT AVERAGE - ALL COURSES",
        "needs": "",
        "list": []
    },
    {
        "title": "TERM GRADE POINT AVERAGE IN MAJOR",
        "needs": "",
        "list": []
    },
    {
        "title": "DEPARTMENT GRADE POINT AVERAGE (DPHR) IN CSE",
        "needs": "",
        "list": []
    },
    {
        "title": "",
        "needs": "NEEDS:    27.00    HOURS",
        "list": []
    },
    {
        "title": "SURVEY COURSE",
        "needs": "",
        "list": [
            "ENGR 1100"
        ]
    },
    {
        "title": "INTRO TO ENGINEERING - CHOOSE TWO COURSES",
        "needs": "NEEDS:    2    COURSES",
        "list": [
            "ENGR 1181",
            "ENGR 1281",
            "ENGR 1182",
            "ENGR 1282"
        ]
    },
    {
        "title": "INTRO TO ENGINEERING (TRANSFER) - TAKE THREE COURSES",
        "needs": "NEEDS:    3    COURSES",
        "list": [
            "ENGR 1186",
            "ENGR 1182",
            "ENGR 1187",
            "ENGR 1181",
            "ENGR 1188",
            "ENGR 1182"
        ]
    },
    {
        "title": "ENGINEERING CALCULUS I - CHOOSE A SINGLE COURSE:\t\t\t\t\t\t\t\t\t\t\t\tMATH 1151, 1156, 1161 OR COMBO: MATH 1140 & 1141",
        "needs": "",
        "list": []
    },
    {
        "title": "ENGINEERING CALC I REQUIREMENT",
        "needs": "",
        "list": []
    },
    {
        "title": "ENGINEERING CALCULUS II - CHOOSE ONE OPTION:\t\t\t\t\t\t\t\t\t\t\t\tMATH 1172, OR CALC II AND CALC III COMBINED.\t\t\t\t\t\t\t\t\t\t\t\t CALC II OPTIONS: MATH 1152, 1181H, 4181H\t\t\t\t\t\t\t\t\t\t\t\t CALC III OPTIONS: MATH 2153, 2162, 2182H, 4182H",
        "needs": "",
        "list": []
    },
    {
        "title": "PHYSICS I - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "PHYSICS 1250"
        ]
    },
    {
        "title": "ELECTRONICS - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "ECE 2360"
        ]
    },
    {
        "title": "INTRO TO DIGITAL LOGIC - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "ECE 2060"
        ]
    },
    {
        "title": "LINEAR ALGEBRA - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "MATH 2568"
        ]
    },
    {
        "title": "FOUNDATIONS OF HIGHER MATHEMATICS - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "MATH 3345"
        ]
    },
    {
        "title": "INTRO TO PROBABILITY & STATISTICS FOR ENGINEERS - TAKE\t\t\t\t\t\t\t\t\t\t\t\tONE COURSE",
        "needs": "",
        "list": [
            "STAT 3470.01",
            "STAT 3470.02"
        ]
    },
    {
        "title": "MATH AND SCIENCE ELECTIVES (CSE)\t\t\t\t\t\t\t\t\t\t\t\t CHOOSE A MINIMUM OF 8 HOURS FROM BELOW",
        "needs": "NEEDS:    5.00    HOURS",
        "list": [
            "ANTHROP 2200",
            "BIOLOGY 1113.01",
            "BIOLOGY 1113.02",
            "BIOLOGY 1113E",
            "BIOLOGY 1113H",
            "BIOLOGY 1114.01",
            "BIOLOGY 1114.02"
        ]
    },
    {
        "title": "COMPUTER SCIENCE CORE - TAKE ALL SEVEN COURSES",
        "needs": "NEEDS:    7    COURSES",
        "list": [
            "CSE 2221",
            "CSE 2231",
            "CSE 2321",
            "CSE 2331",
            "CSE 2421",
            "CSE 2431",
            "CSE 3341"
        ]
    },
    {
        "title": "ETHICS IN COMPUTING - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "CSE 2501"
        ]
    },
    {
        "title": "CSE SENIOR CAPSTONE DESIGN - CHOOSE ONE COURSE",
        "needs": "",
        "list": [
            "CSE 5911",
            "CSE 5912",
            "CSE 5913",
            "CSE 5914",
            "CSE 5915"
        ]
    },
    {
        "title": "CSE CORE CHOICES I - TAKE THIS COURSE",
        "needs": "",
        "list": [
            "CSE 3521"
        ]
    },
    {
        "title": "CSE CORE CHOICES II - TAKE 3 COURSES : CHOOSE 1 COURSE\t\t\t\t\t\t\t\t\t\t\t\tFROM EACH SET",
        "needs": "NEEDS:    3    COURSES",
        "list": [
            "CSE 3901",
            "CSE 3902",
            "CSE 3903",
            "CSE 3231",
            "CSE 3241",
            "CSE 3421",
            "CSE 3461"
        ]
    },
    {
        "title": "COMPLETE 17 TECHNICAL ELECTIVE HOURS FROM THE FOLLOWI\t\t\t\t\t\t\t\t\t\t\t\t1.) & 2.) CSE TECHNICAL ELECTIVES - A MINIMUM OF 9 CSE\t\t\t\t\t\t\t\t\t\t\t\tHOURS ARE REQUIRED\t\t\t\t\t\t\t\t\t\t\t\t3.) NON-CSE TECHNICAL ELECTIVES - UP TO 8 HOURS MAY BE\t\t\t\t\t\t\t\t\t\t\t\tUSED OR COMPLETE ONE OF THE BELOW LISTED MINORS.",
        "needs": "",
        "list": []
    },
    {
        "title": "1A.) REQUIRED COURSE - CHOOSE 1 COURSE",
        "needs": "NEEDS:    1    COURSE",
        "list": [
            "CSE 5243",
            "CSE 5523",
            "CSE 5524",
            "CSE 5525",
            "CSE 5526"
        ]
    },
    {
        "title": "1B.) REQUIRED COURSE - TAKE CSE 5914\t\t\t\t\t\t\t\t\t\t\t\tNOTE: HOURS FROM CSE 5914 DO NOT COUNT TOWARDS 17 HOUR\t\t\t\t\t\t\t\t\t\t\t\tREQUIREMENT.",
        "needs": "",
        "list": [
            "CSE 5914"
        ]
    },
    {
        "title": "2.) CSE TECHNICAL ELECTIVES - COMPLETE 6 HOURS FROM CSE\t\t\t\t\t\t\t\t\t\t\t\t3000-5999, PSYCH 5612\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tRESTRICTIONS:\t\t\t\t\t\t\t\t\t\t\t\t425X - A MAXIMUM OF 2 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t4193 - A MAXIMUM OF 2 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t4998 - A MAXIMUM OF 3 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t4999 - A MAXIMUM OF 6 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t(4193+4998+4999) - A MAXIMUM OF 6 HOURS MAY COUNT FROM\t\t\t\t\t\t\t\t\t\t\t\tALL THREE COURSE NUMBERS",
        "needs": "",
        "list": []
    },
    {
        "title": "3.) NON-CSE TECHNICAL ELECTIVES (MAX 8 HOURS MAY COUNT)",
        "needs": "",
        "list": [
            "ACCAD 3350",
            "ACCAD 4101",
            "ACCAD 5001",
            "ACCAD 5002",
            "ACCAD 5003",
            "ACCAD 5100",
            "ACCAD 5141",
            "ACCAD 5150",
            "ACCAD 5301",
            "ACCAD 5500",
            "ACCTMIS 2000",
            "ACCTMIS 2200",
            "ACCTMIS 2300"
        ]
    },
    {
        "title": "COMPLETE ONE OF THE FOLLOWING MINORS INSTEAD OF THE 8\t\t\t\t\t\t\t\t\t\t\t\tHOURS IN NON-CSE TECH ELECTIVES:\t\t\t\t\t\t\t\t\t\t\t\tAIR SCIENCE, ASTRONOMY & ASTROPHYSICS, BIOCHEMISTRY,\t\t\t\t\t\t\t\t\t\t\t\tBIOLOGY, BIOMEDICAL ENGINEERING, BUSINESS, BUSINESS\t\t\t\t\t\t\t\t\t\t\t\tANALYTICS, CHEMISTRY, COGNITIVE SCIENCE, COMMUNICATION\t\t\t\t\t\t\t\t\t\t\t\tTECHNOLOGY, DESIGN FOUNDATIONS, DESIGN THINKING,\t\t\t\t\t\t\t\t\t\t\t\tEARTH SCIENCES (ANY TRACK), ECONOMICS, ENTREPRENEURSHIP\t\t\t\t\t\t\t\t\t\t\t\t& INNOVATION, ENVIRONMENTAL ENGINEERING, FORENSIC\t\t\t\t\t\t\t\t\t\t\t\tSCIENCE, GAMES STUDIES, GEOGRAPHIC INFORMATION SCIENCE,\t\t\t\t\t\t\t\t\t\t\t\tINFORMATION SECURITY, LINGUISTICS, MATH, MICROBIOLOGY,\t\t\t\t\t\t\t\t\t\t\t\tMILITARY SCIENCE, MOLECULAR GENETICS, NAVAL SCIENCE,\t\t\t\t\t\t\t\t\t\t\t\tNEUROSCIENCE, NUCLEAR ENGINEERING, PHYSICS,\t\t\t\t\t\t\t\t\t\t\t\tPROFESSIONAL WRITING, PSYCHOLOGY (ANY TRACK), SECURITY\t\t\t\t\t\t\t\t\t\t\t\t& INTELLIGENCE, STATISTICS, AND STUDIO ART.",
        "needs": "",
        "list": []
    },
    {
        "title": "COMPLETE 17 TECHNICAL ELECTIVE HOURS FROM THE FOLLOWI\t\t\t\t\t\t\t\t\t\t\t\t1.) & 2.) CSE TECHNICAL ELECTIVES - A MINIMUM OF 9 CSE\t\t\t\t\t\t\t\t\t\t\t\tHOURS ARE REQUIRED\t\t\t\t\t\t\t\t\t\t\t\t3.) NON-CSE TECHNICAL ELECTIVES - UP TO 8 HOURS MAY BE\t\t\t\t\t\t\t\t\t\t\t\tUSED OR COMPLETE ONE OF THE BELOW LISTED MINORS.",
        "needs": "",
        "list": []
    },
    {
        "title": "1.) REQUIRED COURSES - CHOOSE 2 COURSES",
        "needs": "NEEDS:    2    COURSES",
        "list": [
            "CSE 5243",
            "CSE 5523",
            "CSE 5524",
            "CSE 5525",
            "CSE 5526"
        ]
    },
    {
        "title": "2.) CSE TECHNICAL ELECTIVES - COMPLETE 3 HOURS FROM CSE\t\t\t\t\t\t\t\t\t\t\t\t3000-5999, PSYCH 5612\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tRESTRICTIONS:\t\t\t\t\t\t\t\t\t\t\t\t425X - A MAXIMUM OF 2 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t4193 - A MAXIMUM OF 2 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t4998 - A MAXIMUM OF 3 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t4999 - A MAXIMUM OF 6 HOURS MAY COUNT\t\t\t\t\t\t\t\t\t\t\t\t(4193+4998+4999) - A MAXIMUM OF 6 HOURS MAY COUNT FROM\t\t\t\t\t\t\t\t\t\t\t\tALL THREE COURSE NUMBERS",
        "needs": "",
        "list": []
    },
    {
        "title": "3.) NON-CSE TECHNICAL ELECTIVES (MAX 8 HOURS MAY COUNT)",
        "needs": "",
        "list": [
            "ACCAD 3350",
            "ACCAD 4101",
            "ACCAD 5001",
            "ACCAD 5002",
            "ACCAD 5003",
            "ACCAD 5100",
            "ACCAD 5141",
            "ACCAD 5150",
            "ACCAD 5301",
            "ACCAD 5500",
            "ACCTMIS 2000",
            "ACCTMIS 2200",
            "ACCTMIS 2300"
        ]
    },
    {
        "title": "COMPLETE ONE OF THE FOLLOWING MINORS INSTEAD OF THE 8\t\t\t\t\t\t\t\t\t\t\t\tHOURS IN NON-CSE TECH ELECTIVES:\t\t\t\t\t\t\t\t\t\t\t\tAIR SCIENCE, ASTRONOMY & ASTROPHYSICS, BIOCHEMISTRY,\t\t\t\t\t\t\t\t\t\t\t\tBIOLOGY, BIOMEDICAL ENGINEERING, BUSINESS, BUSINESS\t\t\t\t\t\t\t\t\t\t\t\tANALYTICS, CHEMISTRY, COGNITIVE SCIENCE, COMMUNICATION\t\t\t\t\t\t\t\t\t\t\t\tTECHNOLOGY, DESIGN FOUNDATIONS, DESIGN THINKING,\t\t\t\t\t\t\t\t\t\t\t\tEARTH SCIENCES (ANY TRACK), ECONOMICS, ENTREPRENEURSHIP\t\t\t\t\t\t\t\t\t\t\t\t& INNOVATION, ENVIRONMENTAL ENGINEERING, FORENSIC\t\t\t\t\t\t\t\t\t\t\t\tSCIENCE, GAMES STUDIES, GEOGRAPHIC INFORMATION SCIENCE,\t\t\t\t\t\t\t\t\t\t\t\tINFORMATION SECURITY, LINGUISTICS, MATH, MICROBIOLOGY,\t\t\t\t\t\t\t\t\t\t\t\tMILITARY SCIENCE, MOLECULAR GENETICS, NAVAL SCIENCE,\t\t\t\t\t\t\t\t\t\t\t\tNEUROSCIENCE, NUCLEAR ENGINEERING, PHYSICS,\t\t\t\t\t\t\t\t\t\t\t\tPROFESSIONAL WRITING, PSYCHOLOGY (ANY TRACK), SECURITY\t\t\t\t\t\t\t\t\t\t\t\t& INTELLIGENCE, STATISTICS, AND STUDIO ART.",
        "needs": "",
        "list": []
    },
    {
        "title": "TAKE THE GE LAUNCH SEMINAR",
        "needs": "",
        "list": [
            "GENED 1201"
        ]
    },
    {
        "title": "WRITING AND INFORMATION LITERACY - COMPLETE",
        "needs": "",
        "list": []
    },
    {
        "title": "MATHEMATICAL AND QUANTITATIVE REASONING OR DATA\t\t\t\t\t\t\t\t\t\t\t\tANALYSIS - TAKE ONE COURSE",
        "needs": "",
        "list": [
            "AEDECON 2005",
            "ANIMSCI 2260",
            "ASTRON 3350",
            "CHEM 2210",
            "CHEM 2210H",
            "COMLDR 3537",
            "CSE 1111"
        ]
    },
    {
        "title": "LITERARY, VISUAL AND PERFORMING ARTS - TAKE ONE COURSE",
        "needs": "",
        "list": [
            "AFAMAST 2101",
            "AFAMAST 2251",
            "AFAMAST 2253",
            "AFAMAST 2270",
            "AFAMAST 2281",
            "AFAMAST 2288",
            "MUSIC 2288",
            "AFAMAST 2367.01",
            "AFAMAST 2367.04"
        ]
    },
    {
        "title": "HISTORICAL AND CULTURAL STUDIES - COMPLETE",
        "needs": "",
        "list": []
    },
    {
        "title": "NATURAL SCIENCE - TAKE ONE COURSE OR SET OF COURSES FOR\t\t\t\t\t\t\t\t\t\t\t\tA MINIMUM OF 4 HOURS",
        "needs": "",
        "list": [
            "ANTHROP 2200",
            "ANTHROP 2200H",
            "ASTRON 1101",
            "BIOLOGY 1101",
            "BIOLOGY 1110",
            "BIOLOGY 1113.01",
            "BIOLOGY 1113.02",
            "BIOLOGY 1113H"
        ]
    },
    {
        "title": "",
        "needs": "",
        "list": [
            "EARTHSC 1200",
            "EARTHSC 1105",
            "EARTHSC 1108",
            "EARTHSC 1110",
            "EARTHSC 1110H",
            "EARTHSC 1110S",
            "EARTHSC 1121",
            "EARTHSC 1151"
        ]
    },
    {
        "title": "SOCIAL AND BEHAVIORAL SCIENCES - COMPLETE",
        "needs": "",
        "list": []
    },
    {
        "title": "RACE, ETHNICITY AND GENDER DIVERSITY - COMPLETE",
        "needs": "",
        "list": []
    },
    {
        "title": "COMPLETE THE CITIZENSHIP FOR A DIVERSE AND JUST WORLD\t\t\t\t\t\t\t\t\t\t\t\tTHEME. COMPLETE 1-2 COURSES FOR A MINIMUM OF 4-6 HOURS.",
        "needs": "",
        "list": []
    },
    {
        "title": "CITIZENSHIP FOR A DIVERSE AND JUST WORLD",
        "needs": "",
        "list": [
            "AFAMAST 3083",
            "AFAMAST 3084",
            "AFAMAST 3110",
            "AFAMAST 3440",
            "AFAMAST 3450",
            "AFAMAST 4610",
            "AFAMAST 4921",
            "ANTHROP 3306"
        ]
    },
    {
        "title": "SELECT ONE OF THE FOLLOWING THEMES AND COMPLETE 1-2\t\t\t\t\t\t\t\t\t\t\t\tCOURSES FOR A MINIMUM OF 4-6 HOURS.",
        "needs": "",
        "list": []
    },
    {
        "title": "LIVED ENVIRONMENTS",
        "needs": "",
        "list": [
            "AFAMAST 2367.07",
            "AFAMAST 3260",
            "AFAMAST 3310",
            "AGRCOMM 2330",
            "ANTHROP 3072",
            "ANTHROP 3411",
            "ANTHROP 3623"
        ]
    },
    {
        "title": "HEALTH AND WELLBEING",
        "needs": "",
        "list": [
            "AFAMAST 5650",
            "ANTHROP 3301",
            "ANTHROP 3302",
            "ANTHROP 3340",
            "ANTHROP 4706",
            "ANTHROP 5601",
            "ANTHROP 5602",
            "ANTHROP 5700"
        ]
    },
    {
        "title": "SUSTAINABILITY",
        "needs": "",
        "list": [
            "AEDECON 2501",
            "AEDECON 2501E",
            "ANTHROP 3050",
            "ANTHROP 3411",
            "ANTHROP 3623",
            "ANTHROP 4597.03",
            "CIVILEN 3530"
        ]
    },
    {
        "title": "MIGRATION, MOBILITY, AND IMMOBILITY",
        "needs": "",
        "list": [
            "AFAMAST 3086",
            "AFAMAST 3370",
            "AFAMAST 3376",
            "ANTHROP 3419",
            "ANTHROP 5627"
        ]
    },
    {
        "title": "NUMBER, NATURE, MIND",
        "needs": "",
        "list": [
            "ASTRON 2142",
            "ASTRON 2143",
            "LING 3802",
            "LING 4052"
        ]
    },
    {
        "title": "ORIGINS AND EVOLUTION",
        "needs": "",
        "list": [
            "ANTHROP 3300",
            "ANTHROP 3409",
            "ANTHROP 5609",
            "ASTRON 2140",
            "ASTRON 2141",
            "ASTRON 2143"
        ]
    },
    {
        "title": "TRADITIONS, CULTURES, TRANSFORMATIONS",
        "needs": "",
        "list": [
            "AFAMAST 3230",
            "AFAMAST 4342",
            "AFAMAST 4571",
            "ANTHROP 3452",
            "CLAS 3217",
            "CLAS 3223",
            "COMPSTD 3130H"
        ]
    },
    {
        "title": "STUDENTS WHO ARE ENROLLED IN THE COLLEGE OF ENGINEERING\t\t\t\t\t\t\t\t\t\t\t\tOR KNOWLTON SCHOOL WILL BE ALLOWED TO USE A COURSE\t\t\t\t\t\t\t\t\t\t\t\tREQUIRED FOR THEIR MAJOR FOR THE GE REFLECTION SEMINAR.",
        "needs": "",
        "list": []
    },
    {
        "title": "",
        "needs": "",
        "list": []
    }
]
lukew3 commented 2 months ago

The subrequirements are designed to be interpreted by a human, so they don't seem to provide a straightforward machine-readable interpretation of the requirements. Some regex could maybe be used to get better insights about the requirements, but I don't think that this would be complete. LLMs might be able to produce machine-readable data, but I think that there may be concerns about accuracy (better than what we have now, though, I guess).

lukew3 commented 2 months ago
{
        "title": "CSE CORE CHOICES II - TAKE 3 COURSES : CHOOSE 1 COURSE\t\t\t\t\t\t\t\t\t\t\t\tFROM EACH SET",
        "needs": "NEEDS:    3    COURSES",
        "list": [
            "CSE 3901",
            "CSE 3902",
            "CSE 3903",
            "CSE 3231",
            "CSE 3241",
            "CSE 3421",
            "CSE 3461"
        ]
    },

Could probably fix this error methodically by reading the courselist text.