Closed Cartasiane closed 5 months ago
@Cartasiane that's because it's including the translated text too. There is a shared_text property with the original text only, and full_text with the full text (only if the text is truncated).
Sadly no:
import json
from facebook_scraper import get_posts
from datetime import datetime
def datetime_serializer(o):
    """Convert a ``datetime`` to a string for ``json.dumps(default=...)``.

    Args:
        o: The object that ``json.dumps`` could not serialize on its own.

    Returns:
        The result of ``o.isoformat()`` when *o* is a ``datetime``.

    Raises:
        TypeError: If *o* is not a ``datetime``.
    """
    if isinstance(o, datetime):
        return o.isoformat()
    raise TypeError("Type not serializable")
# Gather every post yielded by get_posts into a list so the whole result can
# be dumped as a single JSON array.
all_posts = list(
    get_posts(
        group='890901752414740',
        base_url="https://mbasic.facebook.com/groups",
        start_url="https://mbasic.facebook.com/groups/890901752414740?v=timeline",
        pages=3,
        cookies="cookies.txt",
    )
)
# Objects json cannot encode itself are passed to datetime_serializer.
print(json.dumps(all_posts, default=datetime_serializer, indent=4))
[
{
"post_id": "893131828858399",
"text": "How are you today my fellow dev?\n\nHow are you today my fellow dev?How are you today my fellow dev?\n\nHow are you today my fellow dev?",
"post_text": "How are you today my fellow dev?\n\nHow are you today my fellow dev?",
"shared_text": "How are you today my fellow dev?\n\nHow are you today my fellow dev?",
"original_text": null,
"time": "2023-12-15T22:00:00",
"timestamp": null,
"image": null,
"image_lowquality": null,
"images": [],
"images_description": [],
"images_lowquality": [],
"images_lowquality_description": [],
"video": null,
"video_duration_seconds": null,
"video_height": null,
"video_id": null,
"video_quality": null,
"video_size_MB": null,
"video_thumbnail": null,
"video_watches": null,
"video_width": null,
"likes": 0,
"comments": 0,
"shares": 0,
"post_url": null,
"link": null,
"links": [],
"user_id": null,
"username": "Gilbert Gerber",
"user_url": "https://facebook.com/gilbert.rochat.165?eav=Afb1OFqbCbz9DZf4IP34ydINC-qTjdTQn4RPu6hqC_zdxuWYr9Pc784mP3jd3Mnu98E&refid=18&_ft_=encrypted_tracking_data.0AY_i6Yf249ek-nJJQN4MIeLkCagXKaS0OSgTQKO26Fr9q9_kOebjy3qZV-D5GA2thIdMmD_Trdl5KUFi9lk_f2s2eiFxcgbTrngyRlRDsGdySJIpcWUExU4hUm4eg2eBQEueircNe8XbYQTI-ibyUamITJpHzvF5Y86E1Jt4ea0QcgWL1NwY3uhA_5B-x2uXRpRrhojhjjNOzSuLN5ghAcU0nKUNvRA1oAovpMtFmiHwe9AGc8PKRbOD2okcN3WUoF1wOqH2ve6qAQ3VRKQz9-wn8u-piwBJy3vbK-uEAayg9vt98vPX1ahmSOm4EWDjxC2J7cRVAxnOHHlXKDgTcqhu-mv2HYGsVNX-ixIIT8vLPr-Bp8pCf8j6eKBJO3eEuTkaF5Dte0JazECD-lDB3W7AJbCJEg-gzxsMrS797C0kpS19lUJn5yYl-lWjxx9_9u6tSlvTBLPwOaIGflDQotn4pWR50bhq8dXK1BiJo-qSpAX3OhRpLry70ctW_7FE75croRDfGy3hzxL5Qp9O0v2Gzi5OPsmmW3IU6IXIOgKtmShLM99yiA64FNRJFoOCqASeeRkqMoQu5ZaWKGk4tXPBZmXTcwnDL-Bg849hfCBlIPdYe7zoW2bF6aQQcF3NrBr7F8sESYLkKW0l851QmqaJH8C1YVuVrL6tdivjw87QgyMAdImDbFXUinDhjBw7X1Qe_XGOEtZildMxMAiLVmfqOJQbfVPWex3x1Vjobyeb1ZKHNaa_yAceVnfIXmIR4gTiD2FsVvnzpBuEuorBCxrjPoeaUFH0hg2LBEUnqtNAPOTLfN7YnaoYnSQDuHzi53hgsfzcjfb_yJwtYbrtAwXe9uV5qg2CYAU1hMOJHMbUjIqJTMP8JUAxMMcASCZJXZsbzdi6LxY93M7XTgYBLGaQ7yzc6sN-she9CW2NVh1oni0r8EfvIw&__tn__=C-R&paipv=0",
"is_live": false,
"factcheck": null,
"shared_post_id": null,
"shared_time": null,
"shared_user_id": null,
"shared_username": null,
"shared_user_url": null,
"shared_post_url": null,
"available": true,
"comments_full": null,
"reactors": null,
"w3_fb_url": null,
"reactions": null,
"reaction_count": 0,
"with": [
{
"name": "nalah is god",
"link": "https://mbasic.facebook.com/groups/890901752414740/?refid=18&_ft_=encrypted_tracking_data.0AY_i6Yf249ek-nJJQN4MIeLkCagXKaS0OSgTQKO26Fr9q9_kOebjy3qZV-D5GA2thIdMmD_Trdl5KUFi9lk_f2s2eiFxcgbTrngyRlRDsGdySJIpcWUExU4hUm4eg2eBQEueircNe8XbYQTI-ibyUamITJpHzvF5Y86E1Jt4ea0QcgWL1NwY3uhA_5B-x2uXRpRrhojhjjNOzSuLN5ghAcU0nKUNvRA1oAovpMtFmiHwe9AGc8PKRbOD2okcN3WUoF1wOqH2ve6qAQ3VRKQz9-wn8u-piwBJy3vbK-uEAayg9vt98vPX1ahmSOm4EWDjxC2J7cRVAxnOHHlXKDgTcqhu-mv2HYGsVNX-ixIIT8vLPr-Bp8pCf8j6eKBJO3eEuTkaF5Dte0JazECD-lDB3W7AJbCJEg-gzxsMrS797C0kpS19lUJn5yYl-lWjxx9_9u6tSlvTBLPwOaIGflDQotn4pWR50bhq8dXK1BiJo-qSpAX3OhRpLry70ctW_7FE75croRDfGy3hzxL5Qp9O0v2Gzi5OPsmmW3IU6IXIOgKtmShLM99yiA64FNRJFoOCqASeeRkqMoQu5ZaWKGk4tXPBZmXTcwnDL-Bg849hfCBlIPdYe7zoW2bF6aQQcF3NrBr7F8sESYLkKW0l851QmqaJH8C1YVuVrL6tdivjw87QgyMAdImDbFXUinDhjBw7X1Qe_XGOEtZildMxMAiLVmfqOJQbfVPWex3x1Vjobyeb1ZKHNaa_yAceVnfIXmIR4gTiD2FsVvnzpBuEuorBCxrjPoeaUFH0hg2LBEUnqtNAPOTLfN7YnaoYnSQDuHzi53hgsfzcjfb_yJwtYbrtAwXe9uV5qg2CYAU1hMOJHMbUjIqJTMP8JUAxMMcASCZJXZsbzdi6LxY93M7XTgYBLGaQ7yzc6sN-she9CW2NVh1oni0r8EfvIw&__tn__=C-R&paipv=0&eav=Afbclv8L6qFP0rWzWFjE7c1QZh4h3N37d-t6uzZ3ZTREfCnSF787bpyMkCLzvr-fnVc"
}
],
"page_id": null,
"sharers": null,
"translated_text": "",
"image_id": null,
"image_ids": [],
"was_live": false,
"header": "Gilbert Gerber > \u200enalah is god"
},
{
"post_id": "890902099081372",
"text": "I'm a sad cakeI'm a sad cake",
"post_text": "I'm a sad cake",
"shared_text": "I'm a sad cake",
"original_text": null,
"time": "2023-12-12T21:36:00",
"timestamp": null,
"image": null,
"image_lowquality": "https://scontent-zrh1-1.xx.fbcdn.net/m1/v/t6/An_awEcP5a-VJkiSKC4SklmLyo8p7Q3iP5vL6HDsa_ZTJdFfRRdtUFNJfr9LXPYfMhVSkFk4hqLRcj3zU9hTsyzpPGIc4jC3fiqwidCEo8AGZ4Rq.png?ccb=10-5&oh=00_AfBEwin86zUrF3ejlqAlcOFViH8lGAw7HmjX0lb0cWP-FA&oe=65A59D52&_nc_sid=7da55a",
"images": [],
"images_description": [],
"images_lowquality": [
"https://scontent-zrh1-1.xx.fbcdn.net/m1/v/t6/An_awEcP5a-VJkiSKC4SklmLyo8p7Q3iP5vL6HDsa_ZTJdFfRRdtUFNJfr9LXPYfMhVSkFk4hqLRcj3zU9hTsyzpPGIc4jC3fiqwidCEo8AGZ4Rq.png?ccb=10-5&oh=00_AfBEwin86zUrF3ejlqAlcOFViH8lGAw7HmjX0lb0cWP-FA&oe=65A59D52&_nc_sid=7da55a"
],
"images_lowquality_description": [
null
],
"video": null,
"video_duration_seconds": null,
"video_height": null,
"video_id": null,
"video_quality": null,
"video_size_MB": null,
"video_thumbnail": null,
"video_watches": null,
"video_width": null,
"likes": 0,
"comments": 0,
"shares": 0,
"post_url": null,
"link": null,
"links": [],
"user_id": null,
"username": "Gilbert Gerber",
"user_url": "https://facebook.com/gilbert.rochat.165?eav=Afb1OFqbCbz9DZf4IP34ydINC-qTjdTQn4RPu6hqC_zdxuWYr9Pc784mP3jd3Mnu98E&refid=18&_ft_=encrypted_tracking_data.0AY_lUEC03oVnY6AvDdNnYAaEgbMslN37toZS8EpFDtbkKREAbFg7hFd6LNeiQGfWkirfZMsQFLvIEEtwA12ECtxZqv1WJmTUx4h3ULcPwDK30DGRqePi9e7gfWuY2iNQ1V490FsbrKZuZ1hlXL5UPwbWG2T0q5SlUMWMRD1muS1xuhm5guarFDNQkgVHPTPhAJixXC37hmk2CcCyEJzz0SNjUwayh_yD03qwisJHVAbfRbgDvjD4oscImEjI7JQmA5TGnEe-0ebt3wudGW2QLmKXJwOBjKQBLlEDrHUXcKLKWgvxecI2Tot8bhGJiPeC2JLnM6tXTzKt1LhR7-vaRxntdhyGfubQvPLC4-71wlu-jgAA8Uy5lhl_1rm4IOV4Ccc1ijiaHLUwMNC6MzK9GLuy8Zkk7ByKY80Ku9pPzj9CgwvVVGcEcb_5U0LuAmdCdpj6GVU_vEcQ63f6m-gZ4DAeWjGXeL6WLxjU3XTRZi6tvTJUmRaKMT-_oW01D2UX7ofsGTLc8Wa1MDv-HKVRzglkfdUK1rC1-MQalFZ2U_az86V1UgCEvT_Vwr6XOWm0Z1GW8VgCoW4P8COzLxiekVmUBVN-cHbcr7hutLHzGDE-DxIiSncsAS5tsJW2jMrrCr8MixO882uy3-YKPX15tj8bmUvA4AKF0fJI-pg_CafOmXTVNvQeXunxWZcgLtRhshnuiHSAFk9FAYFWPIsCphM_ac0Qdy4Xq-9QHijIau25K6v7BG2OdbpeAOlVUmDRgpkrZUgcVpWskfrcDjYCbI0r4kaMMpi61iG76kJ7Z4z78qQirdu427eTaW0IeHiHLpp1iphF32d1Kbc7NyYdRDzE730bnbxKiS3n3zrmM0B--Jwg6ReG8WCTbA0bSm3Wpeg5pmi4FC1EvEbC6Iw5MgT49VY6GWCmtdxTXztlKycjm4buHsWkpfqGeTB8Fqj13u_BwxP1q9zIq5DQjYmQOXOiuBga9_kRuvXh8w&__tn__=C-R&paipv=0",
"is_live": false,
"factcheck": null,
"shared_post_id": null,
"shared_time": null,
"shared_user_id": null,
"shared_username": null,
"shared_user_url": null,
"shared_post_url": null,
"available": true,
"comments_full": null,
"reactors": null,
"w3_fb_url": null,
"reactions": null,
"reaction_count": 0,
"with": [
{
"name": "nalah is god",
"link": "https://mbasic.facebook.com/groups/890901752414740/?refid=18&_ft_=encrypted_tracking_data.0AY_lUEC03oVnY6AvDdNnYAaEgbMslN37toZS8EpFDtbkKREAbFg7hFd6LNeiQGfWkirfZMsQFLvIEEtwA12ECtxZqv1WJmTUx4h3ULcPwDK30DGRqePi9e7gfWuY2iNQ1V490FsbrKZuZ1hlXL5UPwbWG2T0q5SlUMWMRD1muS1xuhm5guarFDNQkgVHPTPhAJixXC37hmk2CcCyEJzz0SNjUwayh_yD03qwisJHVAbfRbgDvjD4oscImEjI7JQmA5TGnEe-0ebt3wudGW2QLmKXJwOBjKQBLlEDrHUXcKLKWgvxecI2Tot8bhGJiPeC2JLnM6tXTzKt1LhR7-vaRxntdhyGfubQvPLC4-71wlu-jgAA8Uy5lhl_1rm4IOV4Ccc1ijiaHLUwMNC6MzK9GLuy8Zkk7ByKY80Ku9pPzj9CgwvVVGcEcb_5U0LuAmdCdpj6GVU_vEcQ63f6m-gZ4DAeWjGXeL6WLxjU3XTRZi6tvTJUmRaKMT-_oW01D2UX7ofsGTLc8Wa1MDv-HKVRzglkfdUK1rC1-MQalFZ2U_az86V1UgCEvT_Vwr6XOWm0Z1GW8VgCoW4P8COzLxiekVmUBVN-cHbcr7hutLHzGDE-DxIiSncsAS5tsJW2jMrrCr8MixO882uy3-YKPX15tj8bmUvA4AKF0fJI-pg_CafOmXTVNvQeXunxWZcgLtRhshnuiHSAFk9FAYFWPIsCphM_ac0Qdy4Xq-9QHijIau25K6v7BG2OdbpeAOlVUmDRgpkrZUgcVpWskfrcDjYCbI0r4kaMMpi61iG76kJ7Z4z78qQirdu427eTaW0IeHiHLpp1iphF32d1Kbc7NyYdRDzE730bnbxKiS3n3zrmM0B--Jwg6ReG8WCTbA0bSm3Wpeg5pmi4FC1EvEbC6Iw5MgT49VY6GWCmtdxTXztlKycjm4buHsWkpfqGeTB8Fqj13u_BwxP1q9zIq5DQjYmQOXOiuBga9_kRuvXh8w&__tn__=C-R&paipv=0&eav=AfbMHFunh_1YzzyrBijT32jNQim-4L0aB-rsnSsjRjfWYybEP_eEl6QiMBFR2jYn79E"
}
],
"page_id": null,
"sharers": null,
"translated_text": "",
"image_id": null,
"image_ids": [],
"was_live": false,
"header": "Gilbert Gerber > \u200enalah is god"
},
{
"post_id": "890901879081394",
"text": "",
"post_text": "",
"shared_text": "",
"original_text": null,
"time": "2023-12-12T21:36:00",
"timestamp": null,
"image": null,
"image_lowquality": null,
"images": [],
"images_description": [],
"images_lowquality": [],
"images_lowquality_description": [],
"video": null,
"video_duration_seconds": null,
"video_height": null,
"video_id": null,
"video_quality": null,
"video_size_MB": null,
"video_thumbnail": null,
"video_watches": null,
"video_width": null,
"likes": 0,
"comments": 0,
"shares": 0,
"post_url": null,
"link": null,
"links": [],
"user_id": null,
"username": "Gilbert Gerber",
"user_url": "https://facebook.com/gilbert.rochat.165?eav=AfYlnzzFSJwCoC3bbVfDjAS1POrD2ruIrCJy38rQ1FSyAcl9pUdpjTbPjep4txlWGoY&refid=18&_ft_=encrypted_tracking_data.0AY-H38Ulm65L5Amfe1FclgVk5hA0vnt0BBPRqKFT2Vl-FwA44EliXOJRR5kHxArDoCiZnY6QGaj58--o1xYqSt51x2Zuechb-PpgyQ-OYYGIZjp-9dotBrgfA1MOTn5-378KSjWVigWPAho8NTqBtbW8RxXGTgEY5KwgaHRsVwkyI4sa7rLvffybxX9CPrxfUbmSW34iP6MFTXoS4m8NEWCUeW5uKp_PHf3zVnpCc4ayQRYs-z3K-CquAtd0Hvxnvpk3PZWkfMc4nBWxIAsknahbmJ5GxMiLkbo1udW--V7iL7qlFS1DpZPG-0teOYqzisLZbjnkXZnYOKGoqVtjxnZxrawH1WaQP3Xds9v8_PQzbxnU9OsYFxK9MVtiNBtSQv7wFWL-ggOlA0UHqP_HPTmsUAX8WPi-W_rClozzKKYUN43YjMg5_4teNYAmGIirHKkVRgAD1Fd3Wz7vYwKlnnOiuEcA0ZlqESlz0FP4wLxivPeXEfofYJS5LaGNpOjIIeBAHkfae6kU0eAkflimATlsqaxHkCxi4EmUYUGXVfF4GBgpGxLy6Y0TnGb02GM9D1vV1hhLionJnNXIz0JeOMYgwd42RNNy0SfCJX7WByaPKss6ElwlRKs0zr0J4q6G_Ow2EHN2LreslrV0JwyvJ-UU04AM5wwGXl3uyMk8MdyFoNSzcp0_hC2mlEe8b6TOqDvYsxdlXOEN39V84k3ajgiguiFDWDB2FfQSAZ-WyEO87RVNEtOfM19MU7dvedUX8g851KR5l2CC1WvSRiIqN7eGM2ZD80YYirTqDyYBKQtlWIjHVOax89yUSTsacfIiGCY-fpBU6wldmdpsI_DwQN88Ii1Ed2s-lnbRXaLqAeVzXapong3l9zHezQp-XER2XawpLn67iR_e9Y_ly5gfF64sIV2p1LEAZTNhIQfwZvh-1C8S7Xs&__tn__=C-R&paipv=0",
"is_live": false,
"factcheck": null,
"shared_post_id": null,
"shared_time": null,
"shared_user_id": null,
"shared_username": null,
"shared_user_url": null,
"shared_post_url": null,
"available": true,
"comments_full": null,
"reactors": null,
"w3_fb_url": null,
"reactions": null,
"reaction_count": 0,
"with": [
{
"name": "nalah is god",
"link": "https://mbasic.facebook.com/groups/890901752414740/?refid=18&_ft_=encrypted_tracking_data.0AY-H38Ulm65L5Amfe1FclgVk5hA0vnt0BBPRqKFT2Vl-FwA44EliXOJRR5kHxArDoCiZnY6QGaj58--o1xYqSt51x2Zuechb-PpgyQ-OYYGIZjp-9dotBrgfA1MOTn5-378KSjWVigWPAho8NTqBtbW8RxXGTgEY5KwgaHRsVwkyI4sa7rLvffybxX9CPrxfUbmSW34iP6MFTXoS4m8NEWCUeW5uKp_PHf3zVnpCc4ayQRYs-z3K-CquAtd0Hvxnvpk3PZWkfMc4nBWxIAsknahbmJ5GxMiLkbo1udW--V7iL7qlFS1DpZPG-0teOYqzisLZbjnkXZnYOKGoqVtjxnZxrawH1WaQP3Xds9v8_PQzbxnU9OsYFxK9MVtiNBtSQv7wFWL-ggOlA0UHqP_HPTmsUAX8WPi-W_rClozzKKYUN43YjMg5_4teNYAmGIirHKkVRgAD1Fd3Wz7vYwKlnnOiuEcA0ZlqESlz0FP4wLxivPeXEfofYJS5LaGNpOjIIeBAHkfae6kU0eAkflimATlsqaxHkCxi4EmUYUGXVfF4GBgpGxLy6Y0TnGb02GM9D1vV1hhLionJnNXIz0JeOMYgwd42RNNy0SfCJX7WByaPKss6ElwlRKs0zr0J4q6G_Ow2EHN2LreslrV0JwyvJ-UU04AM5wwGXl3uyMk8MdyFoNSzcp0_hC2mlEe8b6TOqDvYsxdlXOEN39V84k3ajgiguiFDWDB2FfQSAZ-WyEO87RVNEtOfM19MU7dvedUX8g851KR5l2CC1WvSRiIqN7eGM2ZD80YYirTqDyYBKQtlWIjHVOax89yUSTsacfIiGCY-fpBU6wldmdpsI_DwQN88Ii1Ed2s-lnbRXaLqAeVzXapong3l9zHezQp-XER2XawpLn67iR_e9Y_ly5gfF64sIV2p1LEAZTNhIQfwZvh-1C8S7Xs&__tn__=C-R&paipv=0&eav=AfayEsJOZ81E1ICCX09Tq-8PQG5r0UnED5ldK-eBeg-asJ0OATt0bHAjgnaHCwjnFqs"
}
],
"page_id": null,
"sharers": null,
"translated_text": "",
"image_id": null,
"image_ids": [],
"was_live": false,
"header": "Gilbert Gerber created the group nalah is god."
}
]
@Cartasiane Thanks for the notice. I found a small issue with the element lookup working recursively, which resulted in duplicated content. I updated the repo and it should be fixed.
Feel like the bug is still here after force update, but different:
[
{
"post_id": "893131828858399",
"text": "How are you today my fellow dev?\n\nHow are you today my fellow dev?",
"post_text": "How are you today my fellow dev?\n\nHow are you today my fellow dev?",
"shared_text": "",
...}]
@moda20 the problem is that
nodes = content.find('p, header, span[role=presentation], div[data-ft]')
gets the root node twice
@Cartasiane I think it's a translation issue: by default the translated text and the original text are joined by the "\n\n" string. Make sure that you are using the same language as the target group's text. In my case I use English with English cookies and I get only one text. If that's the case, you can just split the message on that string and get your desired text.
Again, this needs to be tested on your end too, since cookies can differ. @chelishchev that's probably not the case, since those element tags are all different. The cause might be recursion, where two nodes (a parent and its child) both contain the final text, but based on my checks that's not the case right now with mbasic.
@moda20 I'm testing right now with mbasic For example https://mbasic.facebook.com/groups/1253641912158395/permalink/1502374187285165/
The element has html:
<div class="bj bk" data-ft="{"tn":"-R"}" id="u_0_4_9T">
<div class="bl">
<header class="bm">
<table class="m" role="presentation">
<tbody>
<tr>
<td class="n">
<div class="bn"/>
</td>
<td class="t bo">
<header><h3 class="bp bq br bs"><span><strong><a
href="/eden.asher.5?eav=AfYW3tAHLP-FLyshVu0WWsr8larqp1o2LFC4zoJzzTO_8DtDblySqv_uMwsSSpZrSnk&refid=18&__tn__=C-R&paipv=0">Eden Asher</a></strong><span
class="bt"> > </span><strong><a
href="https://mbasic.facebook.com/groups/1253641912158395/?refid=18&__tn__=C-R&paipv=0&eav=AfYMvFJ8os6y_VfINlZ-hETVD3HngVoDW-hFjrlN3reTKpK_7cYYuQwJEf9eYalhqfo">סאבלט בתל אביב -sublet tel aviv</a></strong></span>
</h3></header>
</td>
</tr>
</tbody>
</table>
</header>
<div class="bu" style="text-align: right" lang="he" dir="rtl" data-ft="{"tn":"*s"}">
<div><p>סאבלט שמתחדש בכל חודש במלון לינק בת״א.<br/> <span class="bv"><span class="bw"
style="height: 16px; width: 16px; font-size: 16px; background-image: url("https://static.xx.fbcdn.net/images/emoji.php/v9/t2d/1/16/1f4cd.png")">📍</span></span>שדרות
שאול המלך 39.<br/> *גדלי החדרים ומחיריהם משתנים ולכן המחירים לא מצויינים.<br/> מה זה כולל?<br/> •חשבונות
ע״ח המלון.<br/> •חדר כושר ללא הגבלה.<br/> •האב משרדים לשימושכם ללא הגבלה.<br/> •חדר כביסה.<br/> •חדר
מאובזר במלון עם טלויזיה, אינטרנט חופשי, ריהוט מלא.</p>
<p> עוד פרטים אצל נויה 052-823-0014</p></div>
</div>
<div class="bx by" data-ft="{"tn":"H"}">
<div class="bz ca" style="width:205px; height:205px;" data-ft="{"tn":"E"}"><a
href="/photo.php?fbid=10231144896435938&id=1538001675&set=pcb.1502374187285165&eav=AfaknYxUxgxrvY8fq-MKwAl8975vg4KbXbrCv2LQ7q95-RBYZ-TaXhBvq066MPx26DE&paipv=0&source=48&refid=18&__tn__=EH-R"
class="cb cc" style="top:0px; left:0px; width: 102px; height: 102px;"><img
src="https://scontent-ham3-1.xx.fbcdn.net/v/t39.30808-6/412105853_10231144920676544_5331894378613705525_n.jpg?stp=c0.10.110.110a_cp0_dst-jpg_e15_p110x80_q65&_nc_cat=101&ccb=1-7&_nc_sid=c8266f&efg=eyJpIjoiYiJ9&_nc_ohc=CbsCt6apgo8AX9-o-Jt&_nc_ht=scontent-ham3-1.xx&oh=00_AfBc1m0zk1ab7g6SYbuFmXfZ6dsxusNIvp3fdnic-HjvvQ&oe=6584B96A"
width="102" height="102" class="cd s" alt="May be an image of 3 people and text"/></a><a
href="/photo.php?fbid=10231144896475939&id=1538001675&set=pcb.1502374187285165&eav=AfbPdU9ZAaKEYk6YjmTUEXa12qsKQPoKdoKtABTY48bEuNFOcxa4pqs9XVxNDjgl5fo&paipv=0&source=48&refid=18&__tn__=EH-R"
class="cb cc" style="top:104px; left:0px; width: 102px; height: 101px;"><img
src="https://scontent-ham3-1.xx.fbcdn.net/v/t39.30808-6/412181863_10231144920596542_1277942914673565328_n.jpg?stp=c193.0.739.731a_cp0_dst-jpg_e15_q65_s118x118&_nc_cat=107&ccb=1-7&_nc_sid=c8266f&efg=eyJpIjoiYiJ9&_nc_ohc=SaStej6HQZ8AX8EI7zO&_nc_ht=scontent-ham3-1.xx&oh=00_AfBuxUjDMGaroJix8LchUfS50z70h-Jl9BIaRHt9Q4KkYQ&oe=658352E5"
width="102" height="101" class="cd s" alt="May be an image of bedroom and indoors"/></a><a
href="/photo.php?fbid=10231144897155956&id=1538001675&set=pcb.1502374187285165&eav=AfZEkiYUc7KVNpU9GLrpsapqrXnjhbTQLG7gYRLrifBsWclXMkvuQy6Cu2p3OCceWTU&paipv=0&source=48&refid=18&__tn__=EH-R"
class="cb cc" style="top:0px; left:104px; width: 101px; height: 68px;"><img
src="https://scontent-ham3-1.xx.fbcdn.net/v/t39.30808-6/412217802_10231144921116555_7305988851821696731_n.jpg?stp=c60.0.1050.707a_cp0_dst-jpg_e15_q65_s110x80&_nc_cat=103&ccb=1-7&_nc_sid=c8266f&efg=eyJpIjoiYiJ9&_nc_ohc=3ykJWLDjrP8AX-8G48D&_nc_ht=scontent-ham3-1.xx&oh=00_AfAgB92_0ODLZcrJDX-2oBRPicRimtwfFIGjGZATRwZVlQ&oe=658474B2"
width="101" height="68" class="cd s" alt="No photo description available."/></a><a
href="/photo.php?fbid=10231144897115955&id=1538001675&set=pcb.1502374187285165&eav=AfawcOcB3vE8Tuk4pjDa2xrlRz8ylOYhriaZfggLe_g3mWr7jqr0mWR2lA1pdI-HdmY&paipv=0&source=48&refid=18&__tn__=EH-R"
class="cb cc" style="top:70px; left:104px; width: 101px; height: 67px;"><img
src="https://scontent-ham3-1.xx.fbcdn.net/v/t39.30808-6/412140098_10231144920956551_423926303237932487_n.jpg?stp=c0.8.1170.776a_cp0_dst-jpg_e15_q65_s110x80&_nc_cat=105&ccb=1-7&_nc_sid=c8266f&efg=eyJpIjoiYiJ9&_nc_ohc=fZ_c8Qn71b8AX9W5MkV&_nc_ht=scontent-ham3-1.xx&oh=00_AfDiiTeFiESiWdifD8CYt4oJAgHVwuY302zv_tvqdDUTbA&oe=65835A35"
width="101" height="67" class="cd s"
alt="May be an image of 1 person, lighting, table, indoors and sliding door"/></a><a
href="/photo.php?fbid=10231144897755971&id=1538001675&set=pcb.1502374187285165&eav=AfagTLYzj9llla6_wcMxQa4lk7faMBNXW3_pvTI4r6nXQ5MoAW0Eza1-n5aB-37lzSc&paipv=0&source=48&refid=18&__tn__=EH-R"
class="cb cc" style="top:138px; left:104px; width: 101px; height: 67px;"><img
src="https://scontent-ham3-1.xx.fbcdn.net/v/t39.30808-6/412125841_10231144921356561_3792192829017205367_n.jpg?stp=c63.0.1044.693a_cp0_dst-jpg_e15_q65_s110x80&_nc_cat=105&ccb=1-7&_nc_sid=c8266f&efg=eyJpIjoiYiJ9&_nc_ohc=HCakhvmUmZQAX-s1Mnp&_nc_ht=scontent-ham3-1.xx&oh=00_AfDm4YX5aM3avdlXgK2473L8VUtydG37tPUvwTeYFOX1-Q&oe=65835DB9"
width="101" height="67" class="cd s" alt="May be an image of lighting and indoors"/></a></div>
<div/>
<div/>
</div>
</div>
<footer class="ce" data-ft="{"tn":"*W"}">
<div class="cf bt"><abbr>3 hrs</abbr><span aria-hidden="true"> · </span><span class="cg"><div
class="ch"><span><span class="cf">Public group</span></span></div></span></div>
<div class="cf bt"><a
href="/save/story/basic/?privacy_mutation_token=eyJ0eXBlIjowLCJjcmVhdGlvbl90aW1lIjoxNzAyODM0ODk4LCJjYWxsc2l0ZV9pZCI6MzU3ODEyNTU1MzAzNDM4fQ%3D%3D&story_id=S%3A_I1538001675%3AVK%3A1502374187285165&action=SAVE&surface=mbasic_story&mechanism=ufi_action_link&after_cursor&eav=AfYWkuH0IGTnfN3x3ifbhUZjlXB1_b_0ydxFQuBxgCdZL7zZ2ALcHMKDmdv6rDolt3Y&paipv=0&ext=1703094098&hash=AeTpwo5130ONSygAjPE&refid=18&__tn__=%2AW-R">Save</a><span
aria-hidden="true"> · </span><a
href="/nfx/basic/direct_actions/?context_str=%7B%22session_id%22%3A%22bb3aac13-c318-4e82-bf5c-ed15451ae5a6%22%2C%22support_type%22%3A%22chevron%22%2C%22type%22%3A4%2C%22story_location%22%3A%22permalink%22%2C%22entry_point%22%3A%22chevron_button%22%2C%22entry_point_uri%22%3A%22https%3A%5C%2F%5C%2Fmbasic.facebook.com%5C%2Fgroups%5C%2F1253641912158395%5C%2Fpermalink%5C%2F1502374187285165%5C%2F%22%2C%22hideable_token%22%3A%22MzQ1MDI2NzG0MDeyMDU0M61zzStxL8ovLfBNzMkJyC8ucS5KTSzJzM8LLskvqqyrMzQ1tjAwMDQzN62rM6gDAA%22%2C%22story_permalink_token%22%3A%22S%3A_I1538001675%3AVK%3A1502374187285165%22%7D&redirect_uri=https%3A%2F%2Fmbasic.facebook.com%2Fgroups%2F1253641912158395%2Fpermalink%2F1502374187285165%2F&av=100091541094541&refid=18&__tn__=%2AW-R&paipv=0&eav=AfbdNHIA2Dd1QI4_0fslYs2KAOaWmehufXZ4DamJ3uoT3-IqAx7M2NFlVj9tEq_cJtY"
class="ci">More</a></div>
</footer>
</div>
And nodes = content.find('p, header, span[role=presentation], div[data-ft]')
is not good because it'll get <div class="bu"
and <p>
inside, so we'll get doubling of content.
In my test I just change code to nodes = content.find('p, header, span[role=presentation]')
and continue test
@chelishchev I am not sure about all the different types of posts that mbasic uses, especially with groups, pages, posts per search, etc. I think I updated the library to handle this in a different place. Could you try the latest version and tell me if the issue still happens for you?
@Cartasiane I think @chelishchev was right about the source of the duplicated text. I updated the library to remove the extra selector [data-ft]
and we don't get the duplicated text anymore
Most text fields are duplicated: