Closed kkevlar closed 5 years ago
It is unclear which pages count as "relevant" in this situation. Should the input to this module be a department code?
import java.net.*;
import java.io.*;
/**
 * Downloads the CPE-329 class listing page from schedules.calpoly.edu and
 * writes its HTML, line by line, to a fixed local file.
 */
public class Scraper
{
    /**
     * Copies the remote page to {@code /home/justin/Documents/csc309/try1.html}.
     * Any failure is reported on stderr as a stack trace; nothing is rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // PrintWriter(File) creates or truncates the file itself, so no separate
        // createNewFile() call is needed.
        File file = new File("/home/justin/Documents/csc309/try1.html");
        try {
            URL url = new URL("http://schedules.calpoly.edu/classes_CPE-329_curr.htm");
            // try-with-resources closes both the reader (and the URL stream under it)
            // and the writer even when the copy loop throws.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()));
                 PrintWriter bw = new PrintWriter(file)) {
                String line;
                while ((line = br.readLine()) != null) {
                    bw.println(line);
                }
                // PrintWriter never throws on write failures; it only records them.
                if (bw.checkError()) {
                    throw new IOException("Error while writing " + file);
                }
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }
}
// Fetch the CSC subject listing and echo its "listing" table to stdout:
// one table row per output line, every cell wrapped in double quotes and
// followed by a comma.
try (final WebClient browser = new WebClient()) {
    final HtmlPage listingPage = browser.getPage("http://schedules.calpoly.edu/subject_CSC_curr.htm");
    final HtmlTable listing = listingPage.getHtmlElementById("listing");
    for (final HtmlTableRow tableRow : listing.getRows()) {
        // Each row starts on a fresh line, so the output begins with a newline.
        System.out.print("\n");
        for (final HtmlTableCell tableCell : tableRow.getCells()) {
            final String quotedCell = "\"" + tableCell.asText() + "\",";
            System.out.print(quotedCell);
        }
    }
}
package logic;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlTable;
import com.gargoylesoftware.htmlunit.html.HtmlTableCell;
import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
/**
 * Scrapes the class listing for one department from schedules.calpoly.edu
 * and converts the rows of its "listing" table into {@code Course} objects.
 */
public class WebScraper
{
    /** CourseData reads row indices 0 through 14 unconditionally, so shorter rows cannot be parsed. */
    private static final int MIN_ROW_COLUMNS = 15;

    // Not used anywhere in this class; kept because it is package-visible.
    ArrayList<ArrayList<String>> fun;

    /**
     * Demo entry point: scrapes the CSC department and prints the resulting list.
     * Any exception is reported on stderr as a stack trace.
     */
    public static void start()
    {
        WebScraper scraper = new WebScraper();
        try {
            System.out.println(scraper.scrapeCoursesByDept("CSC"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads {@code subject_<dept>_curr.htm}, flattens the "listing" table
     * into comma-separated lines (one per table row, each cell followed by a
     * comma) and hands them to {@code ParseHtml.parselines}.
     *
     * @param dept department code inserted into the URL, e.g. {@code "CSC"}
     * @return the courses built from the table, or {@code null} when no table
     *         text was collected (including when the download failed, since
     *         fetch errors are only printed to stderr)
     */
    public List<Course> scrapeCoursesByDept(String dept)
    {
        StringBuilder combinedCSV = new StringBuilder();
        try (final WebClient webClient = new WebClient()) {
            final HtmlPage page = webClient.getPage("http://schedules.calpoly.edu/subject_" + dept + "_curr.htm");
            final HtmlTable table = page.getHtmlElementById("listing");
            for (final HtmlTableRow row : table.getRows()) {
                combinedCSV.append("\n");
                for (final HtmlTableCell cell : row.getCells()) {
                    // NOTE(review): cell text is not escaped, so a comma inside a cell
                    // may be split by the downstream parser - confirm against ParseHtml.
                    combinedCSV.append(cell.asText()).append(",");
                }
            }
        } catch (FailingHttpStatusCodeException | IOException e) {
            // MalformedURLException is an IOException, so it is covered here too.
            e.printStackTrace();
        }
        if (combinedCSV.length() <= 1)
        {
            return null;
        }
        // Every row was prefixed with '\n'; drop the one in front of the first row.
        return get_Course_List(ParseHtml.parselines(combinedCSV.substring(1)));
    }

    /**
     * Groups consecutive table rows that share the same course name into one
     * {@code Course} whose sections are those rows.
     *
     * Row 0 is skipped (the loop starts at index 1, as in the original code,
     * presumably because it is the table header). Rows that are {@code null}
     * or have fewer than 15 columns are skipped because {@code CourseData}
     * cannot parse them. Rows are compared on the full course name, so names
     * of any length group correctly.
     *
     * @param class_list parsed table rows, one list of cell strings per row
     * @return one {@code Course} per run of rows with the same course name,
     *         in table order; empty when {@code class_list} is {@code null}
     *         or has no usable rows
     */
    public List<Course> get_Course_List(ArrayList<ArrayList<String>> class_list)
    {
        ArrayList<Course> course_list = new ArrayList<Course>();
        if (class_list == null)
        {
            return course_list;
        }

        Course current = null;
        String currentName = null;
        ArrayList<Section> sect_list = null;

        for (int i = 1; i < class_list.size(); i++)
        {
            ArrayList<String> row = class_list.get(i);
            if (row == null || row.size() < MIN_ROW_COLUMNS)
            {
                continue;
            }

            // CourseData stores everything in static fields, so each construction
            // overwrites the previous row; copy what is needed immediately.
            new CourseData(row);
            String name = CourseData.course;
            String sect = CourseData.sect;
            int openSeats = CourseData.lcap - CourseData.enrl;

            boolean sameCourse = current != null
                    && (name == null ? currentName == null : name.equals(currentName));
            if (!sameCourse)
            {
                if (current != null)
                {
                    current.setSections(sect_list);
                    course_list.add(current);
                }
                current = new Course(name);
                currentName = name;
                sect_list = new ArrayList<Section>();
            }
            sect_list.add(new Section(sect, new TimeBlock[2], current, openSeats));
        }

        // Flush the final group, which the loop never closes on its own.
        if (current != null)
        {
            current.setSections(sect_list);
            course_list.add(current);
        }
        return course_list;
    }
}
package logic;
import java.util.ArrayList;
/**
 * Holds the columns of one row of a schedule listing.
 *
 * WARNING: every field is {@code static}, so there is only one copy of the
 * data for the whole program. Constructing a new {@code CourseData}
 * overwrites the values visible through all earlier instances; read the
 * fields before parsing the next row. The fields stay static because other
 * code accesses them as {@code CourseData.course}, {@code CourseData.sect}, etc.
 */
public class CourseData
{
    public static String course, sect, id, type, ge, req, days, start, end, instructor, location, ics;
    public static int lcap, ecap, enrl, wait, drop;

    /**
     * Loads one row into the static fields.
     *
     * Indices 0-10 are copied as text. Indices 11-15 are parsed as integers
     * (lcap, ecap, enrl, wait, drop); a missing, null, blank or non-numeric
     * cell becomes {@code -1}. Index 16 is the ICS text, or {@code ""} when
     * the row has no such column.
     *
     * @param data the cell texts of one row, in column order
     * @throws IndexOutOfBoundsException if the row has fewer than 11 entries
     */
    public CourseData(ArrayList<String> data)
    {
        CourseData.course = data.get(0);
        CourseData.sect = data.get(1);
        CourseData.id = data.get(2);
        CourseData.type = data.get(3);
        CourseData.ge = data.get(4);
        CourseData.req = data.get(5);
        CourseData.days = data.get(6);
        CourseData.start = data.get(7);
        CourseData.end = data.get(8);
        CourseData.instructor = data.get(9);
        CourseData.location = data.get(10);

        CourseData.lcap = parseCount(data, 11);
        CourseData.ecap = parseCount(data, 12);
        CourseData.enrl = parseCount(data, 13);
        CourseData.wait = parseCount(data, 14);
        CourseData.drop = parseCount(data, 15);

        CourseData.ics = data.size() > 16 ? data.get(16) : "";
    }

    /**
     * Parses the cell at {@code index} as an int, ignoring surrounding whitespace.
     *
     * @return the parsed value, or {@code -1} when the cell is absent, null,
     *         blank or not a valid integer
     */
    private static int parseCount(ArrayList<String> data, int index)
    {
        if (index >= data.size())
        {
            return -1;
        }
        String cell = data.get(index);
        if (cell == null)
        {
            return -1;
        }
        String trimmed = cell.trim();
        if (trimmed.length() == 0)
        {
            return -1;
        }
        try
        {
            return Integer.parseInt(trimmed);
        }
        catch (NumberFormatException e)
        {
            return -1;
        }
    }

    /**
     * Renders the current field values as labelled lines, one field per line,
     * each terminated by {@code '\n'}. The first line is {@code "Course: <course>"}.
     *
     * @return the multi-line description
     */
    public String str_CourseData()
    {
        StringBuilder ret_str = new StringBuilder();
        ret_str.append("Course: ").append(course).append("\n");
        ret_str.append("Section: ").append(sect).append("\n");
        ret_str.append("ID: ").append(id).append("\n");
        ret_str.append("Type: ").append(type).append("\n");
        ret_str.append("GE: ").append(ge).append("\n");
        ret_str.append("Requirement: ").append(req).append("\n");
        ret_str.append("Days: ").append(days).append("\n");
        ret_str.append("Start: ").append(start).append("\n");
        ret_str.append("End: ").append(end).append("\n");
        ret_str.append("Instructor: ").append(instructor).append("\n");
        ret_str.append("Location: ").append(location).append("\n");
        ret_str.append("lcap: ").append(lcap).append("\n");
        ret_str.append("ecap: ").append(ecap).append("\n");
        ret_str.append("enrl: ").append(enrl).append("\n");
        ret_str.append("wait: ").append(wait).append("\n");
        ret_str.append("drop: ").append(drop).append("\n");
        ret_str.append("ICS: ").append(ics).append("\n");
        return ret_str.toString();
    }
}
To support #7, we need to download HTML pages from schedules.calpoly.edu so that the HTML parser can parse them.
It is unclear what the inputs to this module should be, but it should probably scrape every class listed under a given department code.