Closed Bishwas-py closed 1 week ago
I had the same problem, and it seems there isn't any built-in functionality for that. I wrote a wrapper function that, given a list of keywords, filters out all URLs and sitemaps containing those keywords. The function is a bit clunky because you have to iteratively go through the result of sitemap_tree_for_homepage looking for sitemaps, remove the sitemaps that match, and then concatenate the URLs from all remaining sitemaps. The code below should work as is. You can set your keywords at the top.
def generateAllUrls(domain_url: str, debug_mode=False, *, filter_words=None) -> list:
    """Return the page URLs found in a site's sitemaps, minus keyword matches.

    The sitemap tree for ``domain_url`` is walked to any nesting depth. A
    sitemap that lists pages is skipped entirely when its own URL contains one
    of the keywords; the page URLs of the remaining sitemaps are de-duplicated
    and then filtered by the same keywords. Matching is a case-insensitive
    substring test.

    NOTE(review): ``sitemap_tree_for_homepage`` must already be in scope; it is
    presumably ``usp.tree.sitemap_tree_for_homepage`` from
    ultimate-sitemap-parser -- confirm against the file's imports.

    Args:
        domain_url: Homepage URL handed to ``sitemap_tree_for_homepage``.
        debug_mode: When true, print sitemap/URL counts and the collected
            values.
        filter_words: Optional iterable of keywords (a single string is treated
            as one keyword). Defaults to the built-in list below.

    Returns:
        The de-duplicated page URLs that contain none of the keywords, in
        first-seen order.
    """
    if filter_words is None:
        # url/sitemaps to remove if keyword is found
        filter_words = [
            "post-sitemap", "blog", ".png", ".jpg", ".jpeg", ".gif",
            "tag", "categor", "event", "news",
        ]
    elif isinstance(filter_words, str):
        # Iterating a bare string would yield single characters as keywords.
        filter_words = [filter_words]
    # Lower-case once so matching is case-insensitive; drop empty keywords
    # because "" is a substring of every URL and would filter out everything.
    keywords = tuple(word.lower() for word in filter_words if word)

    def has_keyword(url):
        # True when any keyword occurs anywhere in the lower-cased URL.
        lowered = url.lower()
        return any(word in lowered for word in keywords)

    def leaf_sitemaps(sitemaps):
        # Yield every sitemap exposing `pages`, descending through index
        # sitemaps to any depth (the tree is not limited to three levels).
        for sitemap in sitemaps:
            children = getattr(sitemap, "sub_sitemaps", None)
            if children:
                yield from leaf_sitemaps(children)
            elif hasattr(sitemap, "pages"):
                # Reached for objects without sub-sitemaps, or with an empty
                # `sub_sitemaps`, so their pages are still considered.
                yield sitemap

    # get sitemap tree
    sitemap_tree = sitemap_tree_for_homepage(domain_url)

    # can't use get_pages() functions because we want to remove select sitemaps
    # Dicts are used as ordered sets: duplicates collapse, first-seen order is
    # kept, and the returned list is therefore deterministic.
    sitemap_urls = {}
    page_urls = {}
    for sitemap in leaf_sitemaps(sitemap_tree.sub_sitemaps):
        if has_keyword(sitemap.url):
            continue
        sitemap_urls[sitemap.url] = None
        for page in sitemap.pages:
            page_urls[page.url] = None

    # filter urls
    urls_filtered = [url for url in page_urls if not has_keyword(url)]

    if debug_mode:
        print("# of sitemaps", len(sitemap_urls))
        print("# of urls", "{} (filtered) {} (unfiltered)".format(len(urls_filtered), len(page_urls)))
        print("sitemaps", list(sitemap_urls))
        print("urls", urls_filtered)
    return urls_filtered
For example, if any sitemap/XML contains a string such as discussion, comments, category, or something similar, I want it to be excluded from
sitemap_tree_for_homepage().all_pages()
. How can I do this?