Open SoloHombs opened 8 months ago
"""Scrape the book listings on books.toscrape.com into Excel and CSV files.

Walks the paginated catalogue (page-1.html, page-2.html, ...) until the site
answers with "404 Not Found", collects title, link, price and stock text for
every book, and writes the result to ``Mabook.xlsx`` and ``Mabook.CSV``.
"""

from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Listing pages are fetched from <CATALOGUE_URL>page-<n>.html, n starting at 1.
CATALOGUE_URL = "https://books.toscrape.com/catalogue/"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT = 30


def _is_not_found_page(soup):
    """Return True when the parsed page is titled '404 Not Found'."""
    # A page without a <title> has soup.title == None; treat it as "found"
    # instead of crashing with AttributeError.
    return soup.title is not None and soup.title.text == '404 Not Found'


def _parse_books(soup, page_url):
    """Extract one dict per book from a parsed catalogue listing page.

    Args:
        soup: BeautifulSoup tree of the listing page.
        page_url: URL the page was fetched from; relative book links are
            resolved against it.

    Returns:
        A list of dicts with the keys 'Title', 'Link', 'price' and 'stock'.
    """
    books = []
    for book_item in soup.find_all(
            'li', class_='col-xs-6 col-sm-4 col-md-3 col-lg-3'):
        books.append({
            # The cover image's alt text carries the book title.
            'Title': book_item.find('img').attrs['alt'],
            # hrefs are relative to the listing page, so resolve them against
            # its URL rather than gluing them onto a fixed ".../page-" prefix
            # (which produced links that do not exist).
            'Link': urljoin(page_url, book_item.find('a').attrs['href']),
            # Drop the first character of the price text, as the original
            # script did.
            # NOTE(review): presumably this removes a stray character left by
            # the response's text decoding - confirm against the real output.
            'price': book_item.find('p', class_='price_color').text[1:],
            'stock': book_item.find(
                'p', class_='instock availability').text.strip(),
        })
    return books


def scrape_all_books(max_pages=None):
    """Collect the books from every catalogue page.

    Args:
        max_pages: Optional upper bound on the number of pages to fetch,
            handy for a quick trial run. ``None`` (the default) fetches pages
            until the site reports 404.

    Returns:
        A list of book dicts in catalogue order.

    Raises:
        requests.HTTPError: If a page answers with an error status other
            than 404 (previously such a page was silently skipped and the
            loop could run forever).
        requests.RequestException: On connection problems or timeouts.
    """
    books = []
    current_page = 1
    while max_pages is None or current_page <= max_pages:
        page_url = f"{CATALOGUE_URL}page-{current_page}.html"
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Running past the last page is the normal way to finish. The status
        # code is the primary signal; the title check is kept as a fallback.
        if response.status_code == 404 or _is_not_found_page(soup):
            break
        response.raise_for_status()

        books.extend(_parse_books(soup, page_url))
        current_page += 1
    return books


Hombs = scrape_all_books()

final = pd.DataFrame(Hombs)
# index=False keeps the DataFrame's row index out of the output files.
final.to_excel('Mabook.xlsx', index=False)
final.to_csv('Mabook.CSV', index=False)
Web Scraping Using Python.pdf Mabook.CSV Mabook.xlsx