MaterialEyes / exsclaim

A toolkit for the automatic construction of self-labeled materials imaging datasets from scientific literature
GNU General Public License v3.0

Nature HTML has updated to a new layout #28

Open · WeixinGithubJiang opened this issue 2 years ago

WeixinGithubJiang commented 2 years ago

Describe the bug: The current journal scraper for Nature no longer works because Nature has updated its search results pages to a new HTML layout.

Additional context: Overwriting the Nature class in /exsclaim/exsclaim/journal.py with the version below could be a temporary solution.

import json  # get_license below needs this; journal.py likely imports it already

class Nature(JournalFamily):
    domain =        "https://www.nature.com"
    relevant =      "relevance"
    recent =        "date_desc"
    path =          "/search?q=\""
    join =          "\"%20\""
    pre_sb =        "\"&order="
    open_pre_sb =   "\"&order="
    post_sb =       "&page=1"
    article_path =  ('/articles/','')
    prepend =       ""
    extra_key =     " "

    def get_page_info(self, soup):
        # Finds total results, start page, and total pages from the search results
        # header ("c-list-header") and the pagination list ("c-pagination").
        display_results = (
            soup.find("div", class_="c-list-header")
            .find("div", class_="u-mb-0")
            .find("span", class_="u-display-flex")
            .find_all("span")
        )
        total_results = int(display_results[-1].text.split(" ")[0])
        start_page = 1
        try:
            results = soup.find("ul", class_="c-pagination").find_all("li", class_="c-pagination__item")
            page_ids = [int(r["data-page"]) for r in results if r.get("data-page", "").isdigit()]
            total_pages = max(page_ids)
        except (AttributeError, ValueError):
            # no pagination list (or no numeric page entries) means a single page of results
            total_pages = 1
        return start_page, total_pages, total_results
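    # For reference, the new-layout markup get_page_info assumes (inferred from the
    # lookups above; the counts shown are only placeholders):
    #   <div class="c-list-header"> ... <div class="u-mb-0">
    #     <span class="u-display-flex"> ... <span>1234 results</span></span></div></div>
    #   <ul class="c-pagination"> <li class="c-pagination__item" data-page="2"> ... </ul>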

    def turn_page(self, url, pg_num, pg_size):
        # pg_size is unused here; only the page parameter of the search URL is rewritten
        return url.split("&page=")[0] + "&page=" + str(pg_num)
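    # Example (the URL is only a placeholder):
    #   turn_page('https://www.nature.com/search?q="x"&order=relevance&page=1', 3, 50)
    #   -> 'https://www.nature.com/search?q="x"&order=relevance&page=3'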

    def get_license(self, soup):
        data_layer = soup.find(attrs={"data-test": "dataLayer"})
        data_layer_string = str(data_layer.string)
        # the dataLayer script holds a one-element JSON array; pull out the object
        data_layer_json = "{" + data_layer_string.split("[{", 1)[1].split("}];", 1)[0] + "}"
        parsed = json.loads(data_layer_json)
        # whether the article is open access
        try:
            is_open = parsed["content"]["attributes"]["copyright"]["open"]
        except (KeyError, TypeError):
            is_open = False
        # license type, if present
        try:
            license = parsed["content"]["attributes"]["copyright"]["legacy"]["webtrendsLicenceType"]
        except (KeyError, TypeError):
            license = "unknown"
        return is_open, license
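    # For reference, the dataLayer payload shape get_license assumes (inferred from the
    # key lookups above; the values are only placeholders):
    #   <script data-test="dataLayer">dataLayer = [{"content": {"attributes":
    #       {"copyright": {"open": true, "legacy": {"webtrendsLicenceType": "cc-by"}}}}}];</script>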

    def is_link_to_open_article(self, tag):
        # True if the article card carries the open-access badge
        return tag.find("span", attrs={"data-test": "open-access"}) is not None

    def get_article_extensions(self, articles_visited=None) -> list:
        """
        Create a list of article url extensions from search_query

        Returns:
            A list of article url extensions from search.
        """
        articles_visited = articles_visited if articles_visited is not None else set()
        search_query = self.search_query
        maximum_scraped = search_query["maximum_scraped"]
        article_delim, reader_delims = self.get_article_delimiters()
        search_query_urls = self.get_search_query_urls()
        article_paths = set()
        for page1 in search_query_urls:
            self.logger.info("GET request: {}".format(page1))
            soup = self.get_soup_from_request(page1, fast_load=True)
            start_page, stop_page, total_articles = self.get_page_info(soup)
            self.logger.info(
                "start_page={}, stop_page={}, total_articles={}".format(start_page, stop_page, total_articles)
            )
            for page_number in range(start_page, stop_page + 1):
                request = self.turn_page(page1, page_number, total_articles)
                soup = self.get_soup_from_request(request, fast_load=False)
                for r in soup.find_all("article", class_="u-full-height c-card c-card--flush"):
                    link = r.find("a", href=True)
                    if link is None:
                        continue
                    article = link.get("href")
                    # skip articles already visited and, when only open access is
                    # requested, articles without the open-access badge
                    if article.split("/")[-1] in articles_visited:
                        continue
                    if self.open and not self.is_link_to_open_article(r):
                        continue
                    article_paths.add(article)
                    if len(article_paths) >= maximum_scraped:
                        return list(article_paths)
                self.logger.info("page_number={}, num_of_articles={}".format(page_number, len(article_paths)))

        return list(article_paths)
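
For anyone applying this temporary patch, a minimal standalone check of the new-layout selectors (the same ones get_page_info relies on) is sketched below. It assumes only that requests and beautifulsoup4 are installed; the search URL and query string are just examples, not part of the patch.

# Sanity check of the new-layout selectors against a live Nature search page.
# The query string is only an example; adjust it to your own search.
import requests
from bs4 import BeautifulSoup

url = 'https://www.nature.com/search?q="electron microscopy"&order=relevance&page=1'
soup = BeautifulSoup(requests.get(url, timeout=30).text, "html.parser")

# same chain of lookups as get_page_info above
header_spans = (
    soup.find("div", class_="c-list-header")
    .find("div", class_="u-mb-0")
    .find("span", class_="u-display-flex")
    .find_all("span")
)
print("total results:", int(header_spans[-1].text.split(" ")[0]))

pagination = soup.find("ul", class_="c-pagination")
if pagination is None:
    print("total pages: 1")
else:
    pages = [int(li["data-page"]) for li in pagination.find_all("li", class_="c-pagination__item")
             if li.get("data-page", "").isdigit()]
    print("total pages:", max(pages) if pages else 1)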