lorey / mlscraper

🤖 Scrape data from HTML websites automatically by just providing examples
https://pypi.org/project/mlscraper/

Stackoverflow example not working #14

Closed rish-hyun closed 2 years ago

rish-hyun commented 2 years ago

This is the code:

import logging

import requests
from mlscraper import SingleItemPageSample, RuleBasedSingleItemScraper

items = {
    "https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array": {
        "title": "Why is processing a sorted array faster than processing an unsorted array?"
    },
    "https://stackoverflow.com/questions/927358/how-do-i-undo-the-most-recent-local-commits-in-git": {
        "title": "How do I undo the most recent local commits in Git?"
    },
    "https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do": {
        "title": "What does the “yield” keyword do?"
    },
}

results = {url: requests.get(url) for url in items.keys()}

# train scraper
samples = [
    SingleItemPageSample(results[url].content, items[url]) for url in items.keys()
]
scraper = RuleBasedSingleItemScraper.build(samples)

print("Scraping new question")
html = requests.get(
    "https://stackoverflow.com/questions/2003505/how-do-i-delete-a-git-branch-locally-and-remotely"
).content
result = scraper.scrape(html)

print("Result: %s" % result)

Output

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-9f646dab1fca> in <module>()
     24     SingleItemPageSample(results[url].content, items[url]) for url in items.keys()
     25 ]
---> 26 scraper = RuleBasedSingleItemScraper.build(samples)
     27 
     28 print("Scraping new question")

/usr/local/lib/python3.7/dist-packages/mlscraper/__init__.py in build(samples)
     89                     matches_per_page_right = [
     90                         len(m) == 1 and m[0].get_text() == s.item[attr]
---> 91                         for m, s in zip(matches_per_page, samples)
     92                     ]
     93                     score = sum(matches_per_page_right) / len(samples)

/usr/local/lib/python3.7/dist-packages/mlscraper/__init__.py in <listcomp>(.0)
     88                     matches_per_page = (s.page.select(selector) for s in samples)
     89                     matches_per_page_right = [
---> 90                         len(m) == 1 and m[0].get_text() == s.item[attr]
     91                         for m, s in zip(matches_per_page, samples)
     92                     ]

/usr/local/lib/python3.7/dist-packages/mlscraper/__init__.py in <genexpr>(.0)
     86                 if selector not in selector_scoring:
     87                     logging.info("testing %s (%d/%d)", selector, i, len(selectors))
---> 88                     matches_per_page = (s.page.select(selector) for s in samples)
     89                     matches_per_page_right = [
     90                         len(m) == 1 and m[0].get_text() == s.item[attr]

/usr/local/lib/python3.7/dist-packages/mlscraper/parser.py in select(self, css_selector)
     28     def select(self, css_selector):
     29         try:
---> 30             return [SoupNode(res) for res in self._soup.select(css_selector)]
     31         except NotImplementedError:
     32             logging.warning(

/usr/local/lib/python3.7/dist-packages/bs4/element.py in select(self, selector, _candidate_generator, limit)
   1495                 if tag_name == '':
   1496                     raise ValueError(
-> 1497                         "A pseudo-class must be prefixed with a tag name.")
   1498                 pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
   1499                 found = []

ValueError: A pseudo-class must be prefixed with a tag name.
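
The last frame shows the failure happening inside beautifulsoup4's own CSS selector parser: versions before 4.7.0 (which handed selector parsing off to the soupsieve package) reject any selector token that begins with a pseudo-class and no tag name, and the traceback shows mlscraper generating and testing candidate selectors of exactly that shape. A minimal sketch that reproduces the error in isolation, assuming beautifulsoup4 < 4.7.0 is installed (the HTML and selector here are illustrative, not ones mlscraper necessarily produced):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>hello</p></div>", "html.parser")

# A bare pseudo-class with no tag name in front trips the hand-rolled
# selector parser seen in the last traceback frame above:
soup.select(":nth-child(1)")  # ValueError: A pseudo-class must be prefixed with a tag name.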
lorey commented 2 years ago

Hi @rish-hyun, thanks for posting the issue. The error looks like it comes from the beautifulsoup library rather than mlscraper itself. Could you post the output of pip freeze and the logs?
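
(A quicker check than a full pip freeze, if useful: bs4 exposes the installed release as bs4.__version__, so the snippet below prints it directly; anything below 4.7.0 still uses the old hand-rolled selector parser.)

import bs4

print(bs4.__version__)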

rish-hyun commented 2 years ago

@lorey Sorry for the late reply! It's finally working after upgrading beautifulsoup4.
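
For anyone hitting this later: beautifulsoup4 4.7.0 replaced the built-in selector parsing with the soupsieve package, which understands bare pseudo-classes, so upgrading (for example with pip install --upgrade beautifulsoup4) makes the failing selectors parse. A small sketch, reusing the illustrative HTML from above:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><p>hello</p></div>", "html.parser")

# With bs4 >= 4.7.0 this no longer raises; it matches elements that are
# the first child of their parent:
print(soup.select(":nth-child(1)"))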