gpellicci / scraper-py

Python web scraper for sofascore.it
2 stars 1 forks source link

Sofascore tennis #1

Closed skobuv1 closed 4 years ago

skobuv1 commented 4 years ago

Hey Giacomo, I like your script; it works perfectly for football. I was wondering if you could help me edit the script for tennis. I just need to get all of today's matches on https://www.sofascore.com/tennis (the names of the players and the time, and maybe also the name of the tournament) and export them to JSON or Excel.

But I can't figure out how to loop through the matches :/ I would be very thankful for your help.

import json
import os
import time
import unicodedata
import numpy

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_ajax(driver):
    """Wait (best-effort) until the page has no pending jQuery requests and is loaded.

    Two conditions are polled through one ``WebDriverWait`` with a 15 second
    timeout each: jQuery reports no active AJAX requests, then
    ``document.readyState`` equals ``'complete'``.

    Args:
        driver: The Selenium WebDriver whose current page is being waited on.

    Returns:
        None. A timeout or any other WebDriver error is deliberately ignored so
        the caller carries on with whatever has loaded so far.
    """
    # Imported locally so this function does not depend on an extra
    # module-level import being present.
    from selenium.common.exceptions import WebDriverException

    wait = WebDriverWait(driver, 15)
    try:
        # A page that does not load jQuery is treated as idle. Evaluating the
        # bare `jQuery.active` there raises a JavaScript error, which used to
        # be swallowed below and made the readyState wait get skipped.
        wait.until(lambda d: d.execute_script(
            "return typeof jQuery === 'undefined' || !jQuery.active"))
        wait.until(lambda d: d.execute_script('return document.readyState') == 'complete')
    except WebDriverException:
        # Covers TimeoutException too; waiting is best-effort by design.
        pass

# Scrape the tennis page: open it in Firefox, take the first displayed group of
# event links, open each match's detail panel, read the players and start time,
# and dump the collected records as JSON into "tennis dnes.txt".

# Imported here (the script body) because the file's import block does not
# provide it; it is needed to catch the explicit-wait timeout narrowly.
from selenium.common.exceptions import TimeoutException

# Month abbreviations, index-aligned with months_num.
# NOTE(review): "Maj" and "Okt" are non-English spellings - presumably they
# match what the page renders for the author's locale; verify against the site.
months = ["Jan", "Feb", "Mar", "Apr", "Maj", "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dec"]
months_num = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
month_number = dict(zip(months, months_num))

browser = webdriver.Firefox()
browser.maximize_window()
url = "https://www.sofascore.com/tennis"
start_time = time.time()
browser.get(url)

wait_for_ajax(browser)
allMathes = browser.find_element(
    By.XPATH, "//*[@id='pjax-container-main']/div/div[2]/div/div[2]/div[2]/div/div[1]")

wait_for_ajax(browser)
matches = allMathes.find_elements(By.CLASS_NAME, "js-event-list-tournament-events")

# Use the first event group that is actually displayed. Fall back to an empty
# list so the loop below is simply skipped instead of hitting an unbound name.
visible_group = next((m for m in matches if m.is_displayed()), None)
matches_click = visible_group.find_elements(By.TAG_NAME, "a") if visible_group is not None else []

results = []

# Loop through the match links of the displayed group.
for link in matches_click:
    matchStatus = link.find_element(By.CSS_SELECTOR, "div.cell__section.status").text.replace("\n", " ")

    # Skip cancelled matches. The original test was `not in`, which skipped
    # every match that was NOT cancelled.
    if "Canceled" in matchStatus:
        continue

    link.click()
    try:
        # The element carries two classes, so a CSS selector is required;
        # By.CLASS_NAME does not accept compound class names.
        WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ".js-details-widget-container.widget-container"))
        )
    except TimeoutException:
        # Best-effort: carry on and let the lookups below decide.
        pass
    wait_for_ajax(browser)

    # The link text is split on " - " into the two sides; partition() avoids
    # an IndexError when the separator is missing.
    title = browser.find_element(By.CSS_SELECTOR, "a.h-interactive.js-event-link").text
    home, _, away = title.partition(" - ")
    data = {
        "event": [],
        "eventHome": [],
        "eventAway": [],
    }
    data['homeTeam'] = home
    data['awayTeam'] = away

    date = browser.find_element(
        By.CLASS_NAME, "js-details-component-startTime-container"
    ).find_elements(By.CLASS_NAME, "cell__content")[0]
    # split() must be called; the original assigned the bound method itself.
    # NOTE(review): assumes the text yields day, month, year, time in that
    # order once "." and "," are stripped - confirm against the page.
    d = date.text.replace(".", "").replace(",", "").split()
    # zfill keeps single-digit days two characters wide.
    # NOTE(review): the ":00:000+01:00" suffix is kept exactly as written; a
    # strict ISO 8601 timestamp would use ".000" for the fraction.
    isodate = d[2] + "-" + month_number[d[1]] + "-" + d[0].zfill(2) + "T" + d[3] + ":00:000+01:00"
    data['date'] = isodate

    results.append(data)

# The original opened this file and closed it without writing anything, so the
# scraped data was lost. Write the collected records as JSON.
with open("tennis dnes.txt", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"------ {time.time() - start_time} seconds ------")
# The browser window is left open, as in the original script.

# `exit` is injected by the `site` module and is not guaranteed to exist;
# raising SystemExit is the equivalent that always works.
raise SystemExit(0)
gpellicci commented 4 years ago

Hi, I'm glad you found my solution interesting. Here is some code to retrieve the data you mentioned (tournament names, match times and players).

Please note that I use the Chrome driver rather than the Firefox one, but the solution should work the same.

In practice you just have to find the div you are looking for, then traverse the DOM with "find_element/elements" by tag name or by class name to access the information you need. I hope the code comments explain how it works.

import json
import time

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

def wait_for_ajax(driver):
    """Wait (best-effort) until the page has no pending jQuery requests and is loaded.

    Two conditions are polled through one ``WebDriverWait`` with a 15 second
    timeout each: jQuery reports no active AJAX requests, then
    ``document.readyState`` equals ``'complete'``.

    Args:
        driver: The Selenium WebDriver whose current page is being waited on.

    Returns:
        None. A timeout or any other WebDriver error is deliberately ignored so
        the caller carries on with whatever has loaded so far.
    """
    # Imported locally so this function does not depend on an extra
    # module-level import being present.
    from selenium.common.exceptions import WebDriverException

    wait = WebDriverWait(driver, 15)
    try:
        # The JavaScript snippets must each sit on one line: the pasted version
        # broke the string literals across lines, which is a syntax error.
        # A page without jQuery is treated as idle rather than raising.
        wait.until(lambda d: d.execute_script(
            "return typeof jQuery === 'undefined' || !jQuery.active"))
        wait.until(lambda d: d.execute_script('return document.readyState') == 'complete')
    except WebDriverException:
        # Covers TimeoutException too; waiting is best-effort by design.
        pass

# Scrape the tennis page: for every tournament block collect its name, its
# category and, for each match link, the start time and the player names, then
# write the whole structure to "tennis.json".

# Imported here (the script body) because the file's import block does not
# provide it. The find_element(s)(By.X, ...) form replaces the
# find_element(s)_by_* helpers, which current Selenium releases no longer have.
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.maximize_window()
url = "https://www.sofascore.com/tennis"
start_time = time.time()
browser.get(url)

wait_for_ajax(browser)
# The first element matched by the XPath is searched for tournament blocks.
allTournament = browser.find_elements(
    By.XPATH, "//*[@id='pjax-container-main']/div/div[2]/div/div[2]/div[2]/div")
allTournament = allTournament[0].find_elements(By.CLASS_NAME, "js-event-list-tournament")

data = {
    "tournament": [],
}

# Loop over the tournaments.
for t in allTournament:
    tournament_name = t.find_element(By.CLASS_NAME, "tournament__name")
    tournament_category = t.find_element(By.CLASS_NAME, "tournament__category")

    tournament = {
        "match": [],
        "tournament_name": tournament_name.text,
        "tournament_category": tournament_category.text,
    }

    match_of_tournament = t.find_element(
        By.CLASS_NAME, "js-event-list-tournament-events"
    ).find_elements(By.TAG_NAME, "a")

    # Loop over the matches in the tournament.
    for m in match_of_tournament:
        # One element per player/side of the match.
        player = m.find_elements(By.CLASS_NAME, "event-team")

        # Named match_time so the imported `time` module is not shadowed.
        match_time = m.find_element(By.CLASS_NAME, "u-w48")

        match = {
            "time": match_time.text,
            "player": [p.text for p in player],
        }
        tournament["match"].append(match)

    data["tournament"].append(tournament)

# Dump the JSON into a file; `with` closes the handle even if the write fails.
json_data = json.dumps(data)
with open("tennis.json", "w") as f:
    f.write(json_data)
skobuv1 commented 4 years ago

Man, thanks a lot, works like a charm :) sorry for the trouble, I just came to Python this week for one side project :) Grazie Mille