mma_crawler
import pathlib
import pickle
import socket
import time
import traceback

import requests
import bs4

# NOTE: SLACK_URL should be set to a Slack incoming-webhook URL before
# send_to_slack / send_error_to_slack are used (it is not defined in this snippet).

class PickleDatabase():
    def __init__(self, filename='db.pkl'):
        # Create an empty database file (a pickled set) on first run.
        if not pathlib.Path(filename).is_file():
            with open(filename, 'wb') as f:
                pickle.dump(set(), f)
        with open(filename, 'rb') as f:
            self._db = pickle.load(f)
        self._filename = filename

    def commit(self):
        with open(self._filename, 'wb') as f:
            return pickle.dump(self._db, f)

    def update(self, value):
        self._db.add(value)

    def find(self, value):
        return value in self._db

def parse_board():
    # Fetch the latest posts from the board list (up to pageUnit=100 entries).
    req = requests.post(
        url='https://work.mma.go.kr/caisBYIS/board/boardList.do',
        data={
            'gesipan_gbcd': '13',
            'tmpl_id': '1',
            'menu_id': 'm_m8_6',
            'pageUnit': '100',
        }
    )
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    rows = soup.find('table', class_='brd_list_n').tbody.find_all('tr')
    ret = []
    for row in rows:
        tds = row.find_all('td')
        name = tds[0].a.text.strip()
        url = tds[0].a['onclick']
        ret.append((name, url))
    return ret

def parse_url(url):
    # HACK: convert the incoming url into a list of parameters.
    # e.g. url = "javascript:fnBoardView('m_m8_6','13','2000127135','','','1','10');"
    #   -> params = ['m_m8_6', '13', '2000127135', '', '', '1', '10']
    params = [x[1:-1] for x in url.replace('(', ',').replace(')', ',').split(',')[1:-1]]
    return {
        'menu_id': params[0],
        'gesipan_gbcd': params[1],
        'ilryeon_no': params[2],
        'searchCondition': params[3],
        'searchKeyword': params[4],
        'pageIndex': params[5],
        'pageUnit': params[6],
    }

def parse_page(data):
    # Fetch a single post and extract its title, writer, date, body, and attachments.
    req = requests.post(
        url='https://work.mma.go.kr/caisBYIS/board/boardView.do',
        data=data
    )
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    rows = soup.find('table', class_='brd_view').find_all('tr')
    files = []
    for a in rows[3].td.find_all('a'):
        files.append((a.text.strip(), 'https://work.mma.go.kr' + a['href']))
    return {
        'title': rows[0].td.text.strip(),
        'writer': rows[1].td.text.strip(),
        'date': rows[2].td.text.strip(),
        'content': '\n'.join(rows[4].td.strings),
        'files': files
    }

def send_to_slack(title, writer, date, content, files):
    requests.post(
        SLACK_URL,
        json={
            'attachments': [
                {
                    'title': title,
                    'author_name': writer + ' ' + date,
                    'author_link': 'https://work.mma.go.kr/caisBYIS/main.do',
                    'text': content,
                    'fields': [
                        # One link field per attached file.
                        {'value': '<{}|{}>'.format(url, title), 'short': False}
                        for title, url in files
                    ]
                }
            ],
        }
    )

def send_error_to_slack(content):
    requests.post(
        SLACK_URL,
        json={
            'attachments': [
                {
                    'title': 'Exception raised',
                    'author_name': socket.gethostname(),
                    'text': content,
                }
            ],
        }
    )

def main():
    db = PickleDatabase()
    for name, url in parse_board():
        # Only post entries we haven't seen before.
        if not db.find((name, url)):
            send_to_slack(**parse_page(parse_url(url)))
            db.update((name, url))
    db.commit()

if __name__ == '__main__':
    while True:
        try:
            main()
        except KeyboardInterrupt:
            break
        except Exception:
            send_error_to_slack(traceback.format_exc())
        time.sleep(600)  # poll every 10 minutes
Abstract
Basically, SGU cyber campus is horrible and Abeek is abysmal. Consider this: say someone who cares about grades checks these websites 4 times a day on weekdays, losing about 5 minutes per check. Across an average 15-week semester, that's 300 checks, or about 25 hours.
We must also consider indirect costs, such as time lost to interrupted streaks of productivity and scrambling after finding out about an assignment at the last minute. Let's approximate these at an additional 5 minutes per check, which puts the total loss per student at 50 hours per semester.
Here are the main issues IMO:
If we can provide a single-channel, well-designed service (possibly with push notifications) that cuts this time in half, and market it to reach just 100 people, that is a net gain of 2,500 hours (3 months!) of productivity for the world, per semester. If we hit that goal, the existing user base will likely draw more users, encourage further development, and attract attention from TAs and professors.
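As a sanity check, here is the back-of-envelope math above as a quick Python sketch (the 4 checks per weekday, 5 + 5 minutes per check, 15-week semester, and 100-user figures are the assumptions stated in this section; the constant names are just illustrative):
# Back-of-envelope productivity math from the paragraphs above.
CHECKS_PER_DAY = 4
WEEKDAYS_PER_WEEK = 5
WEEKS_PER_SEMESTER = 15
DIRECT_MINUTES_PER_CHECK = 5    # time spent actually checking the sites
INDIRECT_MINUTES_PER_CHECK = 5  # interruptions, last-minute scrambling, etc.
TARGET_USERS = 100

checks = CHECKS_PER_DAY * WEEKDAYS_PER_WEEK * WEEKS_PER_SEMESTER  # 300 checks
direct_hours = checks * DIRECT_MINUTES_PER_CHECK / 60             # 25.0 hours
total_hours = checks * (DIRECT_MINUTES_PER_CHECK + INDIRECT_MINUTES_PER_CHECK) / 60  # 50.0 hours
saved_per_student = total_hours / 2                    # "cut this time in half" -> 25.0 hours
saved_per_semester = saved_per_student * TARGET_USERS  # 2500.0 hours

print(checks, direct_hours, total_hours, saved_per_semester)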
So... What Are We Making?
Almost forgot to mention that. Will update soon.
Approach (First Things First)
Currently, I'm trying to gauge whether students are interested in this idea. Once that is confirmed, I want to explore the workflow for TAs and professors: why do they use the old Abeek system, or proprietary lab sites? This project should start there, considering the experience of users at both ends.
Original Idea: School Assignment Board Crawler
This is great, but only a few people would be able to use it.
Dream...
Any comments, feedback, stars, etc. will be greatly appreciated. Feel free to email me about this anytime.