Scrape relevant pages from schedules.calpoly.edu

kkevlar commented 5 years ago

To support #7, we need to download HTML pages from schedules.calpoly.edu so that the HTML parser can parse them.

It is unclear what the inputs to this should be, but it should probably scrape every class listed under a given department code.

kkevlar commented 5 years ago

It is unclear which pages are "relevant" in this situation. Should the input to this module be a department code?

JustinPrivitera commented 5 years ago


import java.net.*;
import java.io.*;

/**
 * Downloads one page and saves it to a local file.
 *
 * Usage: {@code java Scraper [url] [outputFile]}. Both arguments are optional;
 * when omitted, the CPE-329 class page and the original output path are used.
 */
public class Scraper
{
    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_URL = "http://schedules.calpoly.edu/classes_CPE-329_curr.htm";

    /** File written when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "/home/justin/Documents/csc309/try1.html";

    /**
     * Fetches the page and writes it to disk, reporting any failure on stderr.
     *
     * @param args optional: {@code args[0]} is the URL to download,
     *             {@code args[1]} is the file to write
     */
    public static void main(String[] args) {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        String target = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT;

        try {
            download(new URL(address), new File(target));
        } catch (IOException ioe) {
            // MalformedURLException is an IOException, so a bad URL ends up here too.
            System.err.println("Could not save " + address + " to " + target);
            ioe.printStackTrace();
        }
    }

    /**
     * Copies the bytes served at {@code url} into {@code file}, creating or
     * overwriting the file.
     *
     * The copy is done on raw bytes so the page is not re-encoded through the
     * platform default charset.
     *
     * @throws IOException if the page cannot be read or the file cannot be written
     */
    private static void download(URL url, File file) throws IOException {
        // try-with-resources closes both streams even when the copy fails part-way.
        try (InputStream in = url.openStream();
             OutputStream out = new FileOutputStream(file)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }
}

kkevlar commented 5 years ago

// Print every row of the "listing" table on the CSC subject page as quoted, comma-terminated values.
try (final WebClient client = new WebClient()) {
    final HtmlPage subjectPage = client.getPage("http://schedules.calpoly.edu/subject_CSC_curr.htm");
    final HtmlTable listing = subjectPage.getHtmlElementById("listing");
    for (final HtmlTableRow tableRow : listing.getRows()) {
        // A newline is emitted before each row, so the output starts with one.
        System.out.print("\n");
        for (final HtmlTableCell tableCell : tableRow.getCells()) {
            final String quoted = "\"" + tableCell.asText() + "\"";
            System.out.print(quoted + ",");
        }
    }
}

Based on the HtmlUnit table how-to: http://htmlunit.sourceforge.net/table-howto.html

JustinPrivitera commented 5 years ago

package logic;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableCell;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;

/**
 * Scrapes the course listing for a department from schedules.calpoly.edu and
 * turns the table rows into {@code Course} objects with their sections.
 */
public class WebScraper 
{
    /** Number of leading characters of the course column used to decide which rows belong to the same course. */
    private static final int COURSE_KEY_LENGTH = 7;

    // Not read or written anywhere in this class; left in place in case other
    // classes in the package reference it.
    ArrayList<ArrayList<String>> fun;

    /**
     * Scrapes the CSC department and prints the result to stdout.
     * Any exception is reported on stderr rather than propagated.
     */
    public static void start() 
    {
        WebScraper scraper = new WebScraper();
        try {
            System.out.println(scraper.scrapeCoursesByDept("CSC"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the current-term subject page for {@code dept}, flattens the
     * "listing" table into comma-separated lines, and builds the course list
     * from the parsed lines.
     *
     * @param dept department code inserted into the page URL (e.g. "CSC")
     * @return the courses found, or {@code null} when {@code dept} is blank,
     *         the page could not be fetched, or the table produced no text
     */
    public List<Course> scrapeCoursesByDept(String dept)
    {
        if (dept == null || dept.trim().isEmpty())
        {
            return null;
        }

        // StringBuilder avoids re-copying the whole text for every table cell.
        StringBuilder combinedCSV = new StringBuilder();

        try (final WebClient webClient = new WebClient()) {
            final HtmlPage page = webClient.getPage("http://schedules.calpoly.edu/subject_" + dept + "_curr.htm");
            final HtmlTable table = page.getHtmlElementById("listing");
            for (final HtmlTableRow row : table.getRows()) {
                combinedCSV.append('\n');
                for (final HtmlTableCell cell : row.getCells()) {
                    // NOTE(review): cell text is not escaped, so a cell containing a
                    // comma will look like two fields - confirm ParseHtml copes with that.
                    combinedCSV.append(cell.asText()).append(',');
                }
            }
        } catch (FailingHttpStatusCodeException | IOException e) 
        {
            // MalformedURLException is an IOException, so it is covered here as well.
            e.printStackTrace();
            return null;
        }

        if (combinedCSV.length() <= 1)
        {
            return null;
        }

        // Drop the newline that precedes the first row.
        return get_Course_List(ParseHtml.parselines(combinedCSV.substring(1)));
    }

    /**
     * Groups consecutive rows that share the same course key into one
     * {@code Course} with one {@code Section} per row.
     *
     * Row 0 is skipped (presumably the table header - confirm against the page).
     *
     * @param class_list parsed table rows, one list of column values per row
     * @return the courses in the order they appear; empty when there are no data rows
     */
    public List<Course> get_Course_List(ArrayList<ArrayList<String>> class_list)
    {
        ArrayList<Course> course_list = new ArrayList<Course>();
        if (class_list == null)
        {
            return course_list;
        }

        int i = 1;
        while (i < class_list.size())
        {
            // CourseData keeps its values in static fields, so everything needed
            // from a row is read straight after that row's CourseData is built.
            CourseData first = new CourseData(class_list.get(i));
            String id = courseKey(first.course);
            Course course = new Course(first.course);
            ArrayList<Section> sect_list = new ArrayList<Section>();

            // Collect rows while they belong to the same course. The row at i
            // always matches its own key, so j advances at least once per pass.
            int j = i;
            while (j < class_list.size())
            {
                CourseData data = new CourseData(class_list.get(j));
                if (!id.equals(courseKey(data.course)))
                {
                    break;
                }
                // Meeting times are not filled in yet: the TimeBlock array holds two nulls.
                sect_list.add(new Section(data.sect, new TimeBlock[2], course, data.lcap - data.enrl));
                j++;
            }

            // Resume at the first row of the next course so no row is skipped.
            i = j;
            course.setSections(sect_list);
            course_list.add(course);
        }

        return course_list;
    }

    /**
     * Returns the grouping key for a course column value: its first
     * {@code COURSE_KEY_LENGTH} characters, or the whole value when shorter.
     *
     * Known limitation carried over from the original: course codes longer
     * than 7 characters (the 8-character "P___" courses) are truncated.
     */
    private static String courseKey(String course)
    {
        if (course == null)
        {
            return "";
        }
        return course.length() <= COURSE_KEY_LENGTH ? course : course.substring(0, COURSE_KEY_LENGTH);
    }

}

package logic;

import java.util.ArrayList;

/**
 * Holds the column values of one row of the schedule listing table.
 *
 * NOTE(review): every field is static, so constructing a new CourseData
 * overwrites the values seen through all earlier instances. The fields are
 * kept static because other code reads them as {@code CourseData.<field>};
 * read what you need immediately after constructing.
 */
public class CourseData
{
    public static String course, sect, id, type, ge, req, days, start, end, instructor, location, ics;
    public static int lcap, ecap, enrl, wait, drop;

    /** Value stored in a numeric field whose column is absent, empty, or not a number. */
    private static final int MISSING = -1;

    /**
     * Loads the fields from one table row.
     *
     * Columns are read by position: 0-10 and 16 are text, 11-15 are integers.
     * A column that is not present in {@code data} yields "" for text fields
     * and -1 for numeric fields.
     *
     * @param data column values of one row, in table order
     */
    public CourseData(ArrayList<String> data)
    {
        course = textAt(data, 0);
        sect = textAt(data, 1);
        id = textAt(data, 2);
        type = textAt(data, 3);
        ge = textAt(data, 4);
        req = textAt(data, 5);
        days = textAt(data, 6);
        start = textAt(data, 7);
        end = textAt(data, 8);
        instructor = textAt(data, 9);
        location = textAt(data, 10);
        lcap = intAt(data, 11);
        ecap = intAt(data, 12);
        enrl = intAt(data, 13);
        wait = intAt(data, 14);
        drop = intAt(data, 15);
        ics = textAt(data, 16);
    }

    /** Returns the value at {@code index}, or "" when the row has no such column. */
    private static String textAt(ArrayList<String> data, int index)
    {
        return index < data.size() ? data.get(index) : "";
    }

    /**
     * Parses the value at {@code index} as an int, ignoring surrounding
     * whitespace. Returns -1 when the column is absent, null, empty, or not
     * a valid integer.
     */
    private static int intAt(ArrayList<String> data, int index)
    {
        if (index >= data.size())
        {
            return MISSING;
        }
        String raw = data.get(index);
        if (raw == null)
        {
            return MISSING;
        }
        String trimmed = raw.trim();
        if (trimmed.isEmpty())
        {
            return MISSING;
        }
        try
        {
            return Integer.parseInt(trimmed);
        }
        catch (NumberFormatException e)
        {
            return MISSING;
        }
    }

    /**
     * Formats all fields as "Label: value" lines, one per field, each
     * terminated by a newline.
     *
     * @return the multi-line description, starting with the "Course: " line
     */
    public String str_CourseData()
    {
        StringBuilder ret_str = new StringBuilder();
        ret_str.append("Course: ").append(course).append("\n");
        ret_str.append("Section: ").append(sect).append("\n");
        ret_str.append("ID: ").append(id).append("\n");
        ret_str.append("Type: ").append(type).append("\n");
        ret_str.append("GE: ").append(ge).append("\n");
        ret_str.append("Requirement: ").append(req).append("\n");
        ret_str.append("Days: ").append(days).append("\n");
        ret_str.append("Start: ").append(start).append("\n");
        ret_str.append("End: ").append(end).append("\n");
        ret_str.append("Instructor: ").append(instructor).append("\n");
        ret_str.append("Location: ").append(location).append("\n");
        ret_str.append("lcap: ").append(lcap).append("\n");
        ret_str.append("ecap: ").append(ecap).append("\n");
        ret_str.append("enrl: ").append(enrl).append("\n");
        ret_str.append("wait: ").append(wait).append("\n");
        ret_str.append("drop: ").append(drop).append("\n");
        ret_str.append("ICS: ").append(ics).append("\n");
        return ret_str.toString();
    }
}

kkevlar / section-searcher

Scrape relevant pages from schedules.calpoly.edu #49