rkwahile / public

0 stars 0 forks source link

Oracle Goldengate & Python #6

Open rkwahile opened 6 years ago

rkwahile commented 6 years ago

USING CONDA https://www.digitalocean.com/community/tutorials/how-to-install-the-anaconda-python-distribution-on-ubuntu-16-04

Oracle Goldengate http://www.oracle.com/webfolder/technetwork/tutorials/obe/fmw/goldengate/11g/GGS_Sect_Config_WinUX_ORA_to_WinUX_ORA_datapump/GGS_Sect_Config_WinUX_ORA_to_WinUX_ORA_datapump.pdf

https://www.doag.org/formes/pubfiles/9558103/2017-DB-Patrick_Hurley-GoldenGate__a_Live_Introduction-Manuskript.pdf

rkwahile commented 6 years ago

Python

Basics: https://likegeeks.com/python-programming-basics/

http://www.diveintopython3.net/

https://blog.hartleybrody.com/web-scraping-cheat-sheet/ https://stackoverflow.com/questions/26675546/crawl-specific-pages-and-data-and-make-it-searchable

Mechanize and Beautiful Soup — filling in a website's input form: http://stockrt.github.io/p/handling-html-forms-with-python-mechanize-and-BeautifulSoup/

Beautiful Soup

from future import print_function from bs4 import BeautifulSoup import sys import requests import time

path = 'keywords.txt' keywords_file = open(path,'r') keywords = keywords_file.readlines() rowFile = [x.replace('\n', '') for x in keywords] headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} write_path = 'searchOutput.txt' output_file = open(write_path,'w')

def parseTable(tables):
    """Extract the text of every cell in every row of the given tables.

    Parameters:
        tables: iterable of parsed <table> elements (BeautifulSoup tags);
            each must support find_all('tr') / find_all('td') and each
            cell must support get_text().

    Returns:
        A list of rows, each a list of cell strings with newlines removed.
        Rows with no <td> cells (e.g. header rows) yield empty lists,
        matching the original behavior.
    """
    result = []
    for table in tables:
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            # fixed: the original encoded each cell to UTF-8 bytes and then
            # called .replace('\n', '') with str arguments, which raises
            # TypeError on Python 3. Stay in text (str) throughout.
            row = [cell.get_text().replace('\n', '') for cell in cells]
            result.append(row)
    return result

# For each keyword, fetch the matching zauba.com HS-code page, parse its
# tables, and append columns 1 and 2 of every sufficiently wide row to the
# output file.
for keyword in rowFile:  # fixed: renamed from `item`, which was shadowed by the inner loop
    searchQuery = keyword.replace(' ', '-')
    url = 'https://www.zauba.com/import-%s-hs-code.html' % searchQuery
    time.sleep(10)  # be polite: pause between keyword lookups
    while True:
        page = requests.get(url, headers=headers, timeout=5)
        if page.status_code == 200:
            break
        # fixed: back off between retries instead of hammering the server
        # in a tight loop on any non-200 response.
        time.sleep(5)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.findAll('table')
    mainList = parseTable(tables)
    delim = "\t"
    # fixed: append to the configured searchOutput.txt (the original opened
    # the root path "/searchOutput.txt" — a different file — once per row
    # and never closed the handles).
    with open(write_path, "a") as log:
        for row in mainList:
            if len(row) > 5:
                print(row[1], delim, row[2], file=log)

rkwahile commented 6 years ago

Scrapy Web Crawling

import scrapy import json from scrapy.selector import Selector from scrapy.http import HtmlResponse from requests import Request from scrapy.http import FormRequest from scrapy.contrib.loader import XPathItemLoader

class QuotesSpider(scrapy.Spider):
    """Spider that POSTs each keyword to the 3CE classification endpoint
    and prints the key fields of the JSON classification response."""

    name = "crawl"

    def start_requests(self):
        """Read keywords from 3ceKeywords.txt and yield one POST request
        per keyword to the 3CE classify endpoint."""
        path = '3ceKeywords.txt'
        # fixed: close the keywords file deterministically with a context
        # manager (the original left the handle open).
        with open(path, 'r') as keywords_file:
            keywords = keywords_file.readlines()
        rowFile = [x.replace('\n', '') for x in keywords]
        for proddesc in rowFile:
            print(proddesc)
            # Request payload expected by the 3CE "classify" API.
            formdata = {
                "state": "start",
                "proddesc": "%s" % proddesc,
                "lang": "en",
                "username": "NOT_SET",
                "userData": {"correlationTxId": ""},
                "destination": "US",
                "origin": "US",
                "schedule": "import/export",
                "profileId": "57471f0c4ac2c9b910000000",
            }
            request_body = json.dumps(formdata)
            print(request_body)
            yield scrapy.Request(
                "https://uscensus.prod.3ceonline.com/ui/classify",
                callback=self.parse_output,
                method="POST",
                body=request_body,
                headers={'Content-Type': 'application/json; charset=UTF-8'},
            )

    def parse_output(self, response):
        """Save the raw JSON response to 'final_output', then parse it and
        print the product description, potential headings and item paths."""
        filename = 'final_output'
        with open(filename, 'wb') as f:
            f.write(response.body)
            self.log('Saved file %s' % filename)
        # fixed: read back the file that was just written; the original
        # loaded 'myscrapyproject/final_output' — a different path — and
        # never closed that handle.
        with open(filename) as f:
            data = json.load(f)
        print(data)
        prodDesc = data["productDescription"]
        potential = data["potentialHeadings"]
        currentItem = data["currentItemPaths"]
        print(str(prodDesc) + ' ' + str(potential) + ' ' + str(currentItem))
        print("-----------------------------------")
        print(type(data))