YanyangChen / stock_analysis


web scraping #4

Open YanyangChen opened 5 years ago

YanyangChen commented 5 years ago

https://www.dataquest.io/blog/web-scraping-tutorial-python/

YanyangChen commented 5 years ago
import requests
from bs4 import BeautifulSoup

# optional HTTP proxy; pass proxies=proxies to requests.get to use it
proxies = {
    'http': 'http://adb.def.hk:8080/',
}

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
print(list(soup.children))                      # Doctype, NavigableString, Tag
print([type(item) for item in soup.children])
html = list(soup.children)[2]                   # the <html> tag
body = list(html.children)[3]                   # the <body> tag
print(list(body.children))
p = list(body.children)[1]                      # the single <p> tag
print(p.get_text())
# print(page.content)
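
Side note: positional indexing into .children (e.g. list(soup.children)[2]) breaks as soon as the markup changes; BeautifulSoup's find expresses the same lookup directly. A minimal sketch of the equivalent:

import requests
from bs4 import BeautifulSoup

page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.find('p').get_text())  # first <p> in the document, same text as above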
YanyangChen commented 5 years ago
import sqlite3
import requests
from bs4 import BeautifulSoup
from sqlite3 import Error

# http://www.sqlitetutorial.net/sqlite-python/sqlite-python-select/

def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)

    return None
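
Side note: create_stock below assumes the stocks table already exists, and the IntegrityError handling in main suggests a uniqueness constraint. A matching schema might look like this (an assumption; the actual DDL isn't shown in this issue):

def create_table(conn):
    # hypothetical DDL inferred from the INSERT in create_stock;
    # 'volumn' (sic) matches the column name used there
    conn.execute("""
        CREATE TABLE IF NOT EXISTS stocks (
            idx     TEXT,
            stkdate TEXT,
            open    REAL,
            close   REAL,
            volumn  INTEGER,
            PRIMARY KEY (idx, stkdate)
        )
    """)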

def select_all_tasks(conn):
    """
    Query all rows in the stocks table
    :param conn: the Connection object
    :return:
    """
    cur = conn.cursor()
    cur.execute("SELECT * FROM STOCKS")

    rows = cur.fetchall()

    for row in rows:
        print(row)

def select_task_by_priority(conn, priority):
    """
    Query tasks by priority
    :param conn: the Connection object
    :param priority:
    :return:
    """
    cur = conn.cursor()
    cur.execute("SELECT * FROM STOCKS")

    rows = cur.fetchall()

    for row in rows:
        print(row)

def create_stock(conn, stock):
    """
    Create a new task
    :param conn:
    :param task:
    :return:
    """

    sql = ''' INSERT INTO stocks(idx,stkdate,open,close,volumn)
              VALUES(?,?,?,?,?) '''
    cur = conn.cursor()
    cur.execute(sql, stock)
    return cur.lastrowid

def web_scrap():
    page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup.prettify())
    print(list(soup.children))
    print([type(item) for item in soup.children])
    html = list(soup.children)[2]   # the <html> tag
    body = list(html.children)[3]   # the <body> tag
    print(list(body.children))
    p = list(body.children)[1]      # the single <p> tag
    print(p.get_text())

def main():
    database = "/Users/chenyanyang/tst.db"

    # create a database connection
    conn = create_connection(database)
    if conn is None:
        return  # connection failed; the error was already printed

    with conn:  # commits on success, rolls back on an exception
        # print("1. Query task by priority:")
        # select_task_by_priority(conn, 1)
        stock_1 = ('0700.HK', 'Jun-02-2036', 300, 301, 87000)
        try:
            create_stock(conn, stock_1)
        except sqlite3.IntegrityError:
            print("duplicate data")
        finally:
            print("2. Query all tasks")
            select_all_tasks(conn)

            # note: these proxies are never passed to requests;
            # use requests.get(url, proxies=proxies) to apply them
            proxies = {
                'http': 'http://adb.def.hk:8080/',
            }

            web_scrap()

if __name__ == '__main__':
    main()
YanyangChen commented 5 years ago

def web_scrap():
    # page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
    page = requests.get("https://finance.yahoo.com/quote/0700.HK/history?period1=1471968000&period2=1535040000&interval=1d&filter=history&frequency=1d")

    soup = BeautifulSoup(page.content, 'html.parser')
    # print(soup.prettify())
    # print("-----------------list(soup.children)-------------------")
    # print(list(soup.children)[1])
    # print("-----------------[type(item) for item in list(soup.children)]-------------------")
    # print([type(item) for item in list(soup.children)[1]])
    print(soup.find_all('td'))
    print([item for item in soup.find_all('td')])
    for item in soup.find_all('td'):
        span = item.find('span')
        if span is not None:    # some cells contain no <span>
            print(span.get_text())
    # html = list(soup.children)[2]
    # body = list(html.children)[3]
    # print(list(body.children))
    # p = list(body.children)[1]
    # print(p.get_text())
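
To tie this back to the SQLite code above, the scraped cells could be grouped into rows and fed to create_stock. A rough sketch, assuming Yahoo's history table at the time used seven <td> cells per row (Date, Open, High, Low, Close, Adj Close, Volume); the helper name and the grouping are hypothetical:

def scrape_history_rows(soup, ticker='0700.HK'):
    """Group the <td> text into 7-column rows matching Yahoo's history table.
    ASSUMPTION: layout is Date, Open, High, Low, Close, Adj Close, Volume.
    """
    cells = []
    for td in soup.find_all('td'):
        span = td.find('span')
        if span is not None:
            cells.append(span.get_text())
    # chunk into rows of 7 and map to (idx, stkdate, open, close, volumn)
    rows = []
    for i in range(0, len(cells) - 6, 7):
        date, opn, high, low, close, adj, vol = cells[i:i + 7]
        rows.append((ticker, date, opn, close, vol))
    return rows

Each returned tuple matches create_stock's parameter order, so the rows can be inserted in a loop.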