Open rkwahile opened 6 years ago
Python
- Basics: https://likegeeks.com/python-programming-basics/
- Dive Into Python 3: http://www.diveintopython3.net/
- Web scraping cheat sheet: https://blog.hartleybrody.com/web-scraping-cheat-sheet/
- https://stackoverflow.com/questions/26675546/crawl-specific-pages-and-data-and-make-it-searchable
Beautiful Soup
```python
from __future__ import print_function

import time

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/39.0.2171.95 Safari/537.36'}

# One search keyword per line.
path = 'keywords.txt'
with open(path) as keywords_file:
    rowFile = [line.replace('\n', '') for line in keywords_file]


def parseTable(tables):
    """Flatten every <tr>/<td> in the given tables into a list of cell texts."""
    result = []
    for table in tables:
        for tr in table.find_all('tr'):
            row = [td.get_text().replace('\n', '') for td in tr.find_all('td')]
            result.append(row)
    return result


# Append matching rows to the output file as tab-separated columns.
with open('searchOutput.txt', 'a') as log:
    for keyword in rowFile:
        searchQuery = keyword.replace(' ', '-')
        url = 'https://www.zauba.com/import-%s-hs-code.html' % searchQuery
        time.sleep(10)  # throttle so the site is not hammered
        for attempt in range(3):  # bounded retry instead of an infinite loop
            page = requests.get(url, headers=headers, timeout=5)
            if page.status_code == 200:
                break
        else:
            continue  # give up on this keyword after three failed attempts
        soup = BeautifulSoup(page.content, 'html.parser')
        mainList = parseTable(soup.find_all('table'))
        delim = '\t'
        for row in mainList:
            if len(row) > 5:
                print(row[1], delim, row[2], file=log)
```
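For a quick local check, `parseTable` can be exercised against an inline HTML snippet instead of a live zauba.com page. The table below is made up for illustration, and the sketch assumes `parseTable` from the script above is in scope:

```python
from bs4 import BeautifulSoup

# Hypothetical two-row table, just to show the shape parseTable returns.
html = """
<table>
  <tr><td>1</td><td>8471.30</td><td>Laptops</td></tr>
  <tr><td>2</td><td>8517.12</td><td>Phones</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'html.parser')
print(parseTable(soup.find_all('table')))
# -> [['1', '8471.30', 'Laptops'], ['2', '8517.12', 'Phones']]
```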
Scrapy Web Crawling
```python
import json

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "crawl"

    def start_requests(self):
        path = '3ceKeywords.txt'
        with open(path) as keywords_file:
            rowFile = [line.replace('\n', '') for line in keywords_file]
        for proddesc in rowFile:
            print(proddesc)
            # proddesc = input("Enter the commodity you want to search")
            formdata = {
                "state": "start",
                "proddesc": proddesc,
                "lang": "en",
                "username": "NOT_SET",
                "userData": {"correlationTxId": ""},
                "destination": "US",
                "origin": "US",
                "schedule": "import/export",
                "profileId": "57471f0c4ac2c9b910000000",
            }
            request_body = json.dumps(formdata)
            print(request_body)
            # POST the classification query as JSON.
            yield scrapy.Request(
                "https://uscensus.prod.3ceonline.com/ui/classify",
                callback=self.parse_output,
                method="POST",
                body=request_body,
                headers={'Content-Type': 'application/json; charset=UTF-8'},
            )

    def parse_output(self, response):
        # Keep a raw copy of the response on disk.
        filename = 'final_output'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        # Parse the JSON straight from the response instead of re-reading the file.
        data = json.loads(response.body)
        prodDesc = data["productDescription"]
        potential = data["potentialHeadings"]
        currentItem = data["currentItemPaths"]
        print(str(prodDesc) + ' ' + str(potential) + ' ' + str(currentItem))
        print("-----------------------------------")
```
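Besides `scrapy crawl crawl` from inside a project, the spider can also be driven as a plain script with Scrapy's `CrawlerProcess`. A minimal sketch, assuming `QuotesSpider` above is in scope:

```python
from scrapy.crawler import CrawlerProcess

# Runs the spider without the scrapy CLI or a full project layout.
process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes
```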
USING CONDA
https://www.digitalocean.com/community/tutorials/how-to-install-the-anaconda-python-distribution-on-ubuntu-16-04
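After installing, a quick sanity check that the conda-provided interpreter is the one actually running (the exact path is an assumption; it depends on where Anaconda was installed):

```python
import sys

# Under a conda install this should point somewhere like ~/anaconda3/bin/python.
print(sys.executable)
print(sys.version)
```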
Oracle GoldenGate
- http://www.oracle.com/webfolder/technetwork/tutorials/obe/fmw/goldengate/11g/GGS_Sect_Config_WinUX_ORA_to_WinUX_ORA_datapump/GGS_Sect_Config_WinUX_ORA_to_WinUX_ORA_datapump.pdf
- https://www.doag.org/formes/pubfiles/9558103/2017-DB-Patrick_Hurley-GoldenGate__a_Live_Introduction-Manuskript.pdf