Closed kylemclaren closed 1 year ago
Yeah, I need to stop using `id`
as a variable name; it's bug-prone.
I had to make some additional changes:
diff --git a/forum_dl/extractors/discourse.py b/forum_dl/extractors/discourse.py
index 29cbf6b..3ae78ff 100644
--- a/forum_dl/extractors/discourse.py
+++ b/forum_dl/extractors/discourse.py
@@ -78,11 +78,11 @@ class DiscourseExtractor(Extractor):
for category_data in site_json["categories"]:
if "parent_category_id" not in category_data:
- id = str(category_data["id"])
+ category_id = str(category_data["id"])
self._set_board(
- path=(id,),
- url=urljoin(self.base_url, f"c/{category_data['slug']}/{id}"),
+ path=(category_id,),
+ url=urljoin(self.base_url, f"c/{category_data['slug']}/{category_id}"),
origin=response.url,
data=category_data,
title=category_data["name"],
@@ -92,12 +92,12 @@ class DiscourseExtractor(Extractor):
for category_data in site_json["categories"]:
if "parent_category_id" in category_data:
slug = category_data["slug"]
- id = str(category_data["id"])
+ category_id = str(category_data["id"])
parent_id = str(category_data["parent_category_id"])
self._set_board(
- path=(parent_id, id),
- url=urljoin(self.base_url, f"c/{slug}/{id}"),
+ path=(parent_id, category_id),
+ url=urljoin(self.base_url, f"c/{slug}/{category_id}"),
origin=response.url,
data=category_data,
title=category_data["name"],
@@ -127,8 +127,8 @@ class DiscourseExtractor(Extractor):
if subboard.data["slug"] == slug:
return subboard
elif url_parts[0] == "t":
- id = url_parts[1]
- json_url = urljoin(self.base_url, f"t/{id}.json")
+ topic_id = url_parts[1]
+ json_url = urljoin(self.base_url, f"t/{topic_id}.json")
response = self._session.get(json_url)
data = response.json()
@@ -136,11 +136,11 @@ class DiscourseExtractor(Extractor):
category_id = str(data["category_id"])
if category_id in self._subboards[self.root.path]:
- path = (category_id, f"{id}")
+ path = (category_id, f"{topic_id}")
else:
for _, subboard in self._subboards[self.root.path].items():
if category_id in self._subboards[self.root.path]:
- path = subboard.path + (category_id, f"{id}")
+ path = subboard.path + (category_id, f"{topic_id}")
break
else:
raise ValueError
@@ -178,7 +178,7 @@ class DiscourseExtractor(Extractor):
topic_id = str(data["id"])
yield Thread(
path=board.path + (topic_id,),
- url=urljoin(self.base_url, f"t/{data['slug']}/{id}"),
+ url=urljoin(self.base_url, f"t/{data['slug']}/{topic_id}"),
origin=response.url,
data=data,
title=data["title"],
Thanks for the patch. I've renamed all the variables named id
to something else in 8d691e24f67a6851aef99370318e10dd80df005d just now.
Hey @mikwielgus, thanks for this project, as I have been looking for a way to crawl a large Discourse site. I'm having the problem below and am not sure whether it is a Python bug or PEBCAK.
The issue is that the URL has
<built-in function id>
after the topic slug instead of the item path/id. I guess it has to do with this line which appears to clash with the built-in Python id function...