Open GuyKh opened 8 months ago
If it helps, here was my work in progress that I haven't had time to finish. myheritage.js.zip
@eljeffeg The main problem from what I can tell is the fact that the relative URLs are coming from GraphQL queries: https://familygraphql.myheritage.com/profile_header_data/ https://familygraphql.myheritage.com/profile_details_data/
e.g. for https://www.myheritage.com/profile-420551751-1500056/dov-kushnir:
{
"data": {
"profile": {
"individual": {
"family_groups": [
{
"type": "parent",
"is_parent_family": true,
"father": {
"relationship_description": "His father",
"relationship_type": "father",
"individual": {
"id": "individual-420551751-1500056",
"name": "DOV KUSHNIR",
"gender": "M",
"age_group": "A",
"lifespan": "Died: 1943",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500056\/dov-kushnir"
}
},
"mother": {
"relationship_description": "His mother",
"relationship_type": "mother",
"individual": {
"id": "individual-420551751-1500057",
"name": "GITL KUSHNIR (\u05dc\u05d1\u05d9\u05ea GUTMAN)",
"gender": "F",
"age_group": "A",
"lifespan": "Died: 1943",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500057\/gitl-kushnir-%D7%9C%D7%91%D7%99%D7%AA-gutman"
}
},
"siblings": [
{
"relationship_description": "His brother",
"relationship_type": "brother",
"individual": {
"id": "individual-420551751-1500066",
"name": "ZEEV KUSHNIR",
"gender": "M",
"age_group": "A",
"lifespan": "1911 - 2011",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500066\/zeev-kushnir"
}
},
{
"relationship_description": "His sister",
"relationship_type": "sister",
"individual": {
"id": "individual-420551751-1500060",
"name": "ROCHL KUSHNER",
"gender": "F",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500060\/rochl-kushner"
}
},
{
"relationship_description": "His sister",
"relationship_type": "sister",
"individual": {
"id": "individual-420551751-1500061",
"name": "TAYBL TILLIE GOLDMAN (\u05dc\u05d1\u05d9\u05ea KUSHNER)",
"gender": "F",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500061\/taybl-tillie-goldman-%D7%9C%D7%91%D7%99%D7%AA-kushner"
}
},
{
"relationship_description": "His brother",
"relationship_type": "brother",
"individual": {
"id": "individual-420551751-1500062",
"name": "AVRAHAM KUSHNER",
"gender": "M",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500062\/avraham-kushner"
}
},
{
"relationship_description": "His brother",
"relationship_type": "brother",
"individual": {
"id": "individual-420551751-1500063",
"name": "BARUCH KUSHNIR",
"gender": "M",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500063\/baruch-kushnir"
}
},
{
"relationship_description": "His brother",
"relationship_type": "brother",
"individual": {
"id": "individual-420551751-1500064",
"name": "SHAMAI KUSHNIR",
"gender": "M",
"age_group": "A",
"lifespan": "Died: 1943",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500064\/shamai-kushnir"
}
},
{
"relationship_description": "His brother",
"relationship_type": "brother",
"individual": {
"id": "individual-420551751-1500065",
"name": "HILEL KUSHNIR",
"gender": "M",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500065\/hilel-kushnir"
}
},
{
"relationship_description": "His sister",
"relationship_type": "sister",
"individual": {
"id": "individual-420551751-1500067",
"name": "HANA RABINOVICH (\u05dc\u05d1\u05d9\u05ea KUSHNIR)",
"gender": "F",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500067\/hana-rabinovich-%D7%9C%D7%91%D7%99%D7%AA-kushnir"
}
}
],
"spouse": null,
"children": null
},
{
"type": "spouse",
"is_parent_family": false,
"father": null,
"mother": null,
"siblings": null,
"spouse": {
"relationship_description": "His wife",
"relationship_type": "wife",
"individual": {
"id": "individual-420551751-1500014",
"name": "\u05d7\u05de\u05d3\u05d4 \u05e7\u05d5\u05e9\u05e0\u05d9\u05e8",
"gender": "F",
"age_group": "A",
"lifespan": "Deceased",
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500014\/%D7%97%D7%9E%D7%93%D7%94-%D7%A7%D7%95%D7%A9%D7%A0%D7%99%D7%A8"
}
},
"children": [
{
"relationship_description": "His son",
"relationship_type": "son",
"individual": {
"id": "individual-420551751-1500005",
"name": "<Private> <Private>",
"gender": "U",
"age_group": "A",
"lifespan": null,
"personal_photo": null,
"link_in_profile_page": "https:\/\/www.myheritage.com\/profile-420551751-1500005\/private"
}
}
]
}
],
"event_facts": {
"data": [
{
"id": "event-420551751-1500016",
"type": "BIRT",
"title": "Birth",
"is_family_fact": false,
"is_fact_of_relative": false,
"date": null,
"year": null,
"formatted_age": null,
"formatted_place": "\u05e4\u05d5\u05dc\u05d9\u05df",
"cause_of_death": null,
"content": null,
"additional_content": null,
"individual": {
"id": "individual-420551751-1500015"
},
"relative": null,
"spouse": null,
"hint": null,
"citations": {
"data": null
},
"notes": {
"data": null
},
"media": {
"data": null
}
},
{
"id": "event-420551751-1500017",
"type": "DEAT",
"title": "Death",
"is_family_fact": false,
"is_fact_of_relative": false,
"date": null,
"year": null,
"formatted_age": null,
"formatted_place": null,
"cause_of_death": null,
"content": null,
"additional_content": null,
"individual": {
"id": "individual-420551751-1500015"
},
"relative": null,
"spouse": null,
"hint": null,
"citations": {
"data": null
},
"notes": {
"data": null
},
"media": {
"data": null
}
}
]
},
"insights": {
"confirmed_record_matches_summary": null,
"consistency_issues_summary": null,
"relative_hints": null
},
"map_pins": {
"data": [
{
"location": {
"name": "Poland",
"point": {
"lat": 51.919438,
"lng": 19.145136
},
"bounds": {
"north_east": {
"lat": 54.835784,
"lng": 24.1458931
},
"south_west": {
"lat": 49.0020252,
"lng": 14.1228641
}
}
},
"facts": {
"data": [
{
"id": "event-420551751-1500016",
"is_fact_of_relative": false,
"is_family_fact": false,
"title": "Birth",
"date": null,
"formatted_place": "\u05e4\u05d5\u05dc\u05d9\u05df",
"individual": {
"id": "individual-420551751-1500015"
},
"relative": null,
"spouse": null
}
]
}
}
]
}
},
"site_membership": null,
"user": null,
"birthday_greeting": null,
"anniversary_greeting": null
}
}
}
Unfortunately, the GraphQL request is sent with a bearer_token
so it'll be quite impossible to reproduce it; also, the URL (i.e. link_in_profile_page
in the response) is nowhere to be seen in the HTML.
Ideally speaking - we need to tap into these responses, since everything we need is there, but I'm not sure how to fetch this data from jQuery/HTML.
I found this code:
function Ty(e) {
var t = e.individual
, n = e.type
, r = e.relationship;
e.familyType;
return Boolean(t) && a().createElement("div", {
className: Ti(["family_relative", n]),
onClick: function() {
K().location.href = t.link_in_profile_page
}
}
It seems that they generate the link on the onClick
event. But I have no idea how to fetch that outside
I fixed various things. One thing that's missing, I think, is the relatives' Birth/Death, which is being ignored today - and the freaking URL (which is a blocker) :/
OK @eljeffeg , I think we're in a whole new realm here.
Using React DevTools
I was able to find this React Component hierarchy:
As you can see Fragment key=family_0
is the whole "Immediate Family" list,
where for each sub-component (including the parent) - there's a component (Cy
- but I guess the name may change) which contains a prop with all the data - name, relation and link.
The question now is... it seems that we can't just fetch this from the HTML itself (as the logic happens today) - and some React analysis is required.
Here's what Google Gemini said:
Q: I'm developing a chrome extension. To show some data, I need to query the react component tree and find a Fragment component by key and fetch its props. How to do that? A: There are two main approaches to achieve this functionality in a Chrome extension:
Content scripts:
- Content scripts are JavaScript programs that run directly within the webpage's DOM. They can access and modify the webpage's content.
Here's how you can use a content script to achieve your goal:
- **Inject a script:** Use the `chrome.tabs.executeScript` API to inject a content script into the webpage where you want to query the React component tree. - **Traverse the React tree:** Within the content script, you can access the React component hierarchy using the browser's developer tools API (`chrome.devtools.inspectedWindow.evaluate`). This API allows you to execute code within the context of the webpage. - **Find Fragment by key:** Use the `document.querySelector` function (or similar methods) to navigate the component tree based on the provided key. Keep in mind that directly querying React components by key might not always be reliable, as keys are internal implementation details. - **Fetch props:** Once you have a reference to the Fragment component, you can access its props using the standard JavaScript property access syntax (e.g., `component.props.propName`).
Background script and devtools extension:
- Background scripts run in the background process of your extension and can communicate with webpages and other scripts using message passing.
- Devtools extensions leverage the Chrome DevTools Extension API to interact with the browser's developer tools.
Here's how you can use this approach:
- **Background script communication:** The devtools panel can communicate with a background script using the `runtime.onMessage` API. - **Content script injection:** The background script can inject a content script similar to the first approach, but with the key information received from the devtools panel. - **Data retrieval and response:** The content script retrieves the Fragment's props and sends them back to the background script using message passing. - **Display data in devtools panel:** The background script receives the data from the content script and updates the devtools panel UI to display the retrieved props.
Important considerations:
- Content scripts can be limited in their ability to access and modify certain parts of the webpage due to security restrictions.
- Directly querying React components by key is not the most reliable approach as keys might not be unique or publicly exposed.
- Consider using a library like
react-devtools
(if allowed by Chrome extension guidelines) to interact with the React component tree in a more robust way. Remember that modifying the DOM or interacting with webpages requires careful consideration of security implications and potential conflicts with other extensions. Make sure your extension adheres to Chrome extension development guidelines.
Yeah, I see what you mean with https://familygraphql.myheritage.com. That's supposed to be MyHeritage's API - wonder if it's still an accessible option. For the react, a content script probably wouldn't work for this part of SmartCopy, but a background script might. We do have a background script that runs from the popup in background.js
, which can make xhr requests and such. Perhaps that would be a way to get the data.
What you have is a lot better than the failure now, so I wonder if we should just commit that and let everyone know we're still working out the dates. We might be able to at least put the years in there, since that appears to be in the data.
@eljeffeg - the problem is not the dates but rather the URLs. Without them, SmartCopy (at least today's implementation) doesn't "recognize" the family members - which is, I think, a crucial part.
I'd definitely leave the dates out, but in my opinion, the URLs are a must....
In a perfect world, I'd somehow reuse the bearer token and call the GraphQL myself (rather than parsing the HTML) and fetch all the data from there.
However, couple of thoughts on this:
On the other hand, I have no clear idea about accessing React Component's state from within an Extension.
Commenting without too much knowledge ... this is about translating the list of family members into queries that fetch the information about the family members? Is the URL you need the one for the family member's page?
Wild thought - is it possible to just follow the relative links from within the page and intercept the queries so that you can figure out what the complete URL has become? (the interception could also return a 5xx code to avoid bumping the rate limiter)? Or is the result of following the relative link a different URL from the one you need?
this is about translating the list of family members into queries that fetch the information about the family members? Is the URL you need the one for the family member's page?
The problem here is how to get the links - the way this is implemented is through React Components, when the page is built - each family member in the list has an onClick
event handler which sends the page to the relevant link (when the user clicks anywhere on the component).
This is unlike the previous (and standard) behavior where an object has an <a href...
tag - which is easily "read" when parsing the HTML.
The trouble here is how to get these links... React BS...
@eljeffeg as a temp solution. We can set the url as the current page url for all family members, so they'll be added but referenced to the source person and not the actual person. WDYT?
If we can add "referenced in
any chance of getting this merged? I'm still hurting from the lack of MyHeritage SmartCopy.
Still hoping to get it working with actual URLs.
As we update the parsers for particular sites, we may want to consider implementing a model based on beautifulsoup. https://beautiful-soup-4.readthedocs.io/en/latest/
@eljeffeg any references in the project?
Isn't Beautiful Soup a Python tool?
Ha, you're right. I work with Python so much I forgot that this project is pretty much just JS.
OK. I give up. I'll set the url
field as ""
and let's see what happens
Is this ready to merge, then?
@alvestrand - this won't do, I'm afraid. Still looking for a solution...
OK @eljeffeg , I think we're in a whole new realm here. Using
React DevTools
I was able to find this React Component hierarchy: As you can see
Fragment key=family_0
is the whole "Immediate Family" list, where for each sub-component (including the parent) - there's a component (Cy
- but I guess the name may change) which contains a prop with all the data - name, relation and link. The question now is... it seems that we can't just fetch this from the HTML itself (as the logic happens today) - and some React analysis is required.
Here's what Google Gemini said:
Q: I'm developing a chrome extension. To show some data, I need to query the react component tree and find a Fragment component by key and fetch its props. How to do that? A: There are two main approaches to achieve this functionality in a Chrome extension:
- Content scripts:
- Content scripts are JavaScript programs that run directly within the webpage's DOM. They can access and modify the webpage's content.
Here's how you can use a content script to achieve your goal:
- **Inject a script:** Use the `chrome.tabs.executeScript` API to inject a content script into the webpage where you want to query the React component tree. - **Traverse the React tree:** Within the content script, you can access the React component hierarchy using the browser's developer tools API (`chrome.devtools.inspectedWindow.evaluate`). This API allows you to execute code within the context of the webpage. - **Find Fragment by key:** Use the `document.querySelector` function (or similar methods) to navigate the component tree based on the provided key. Keep in mind that directly querying React components by key might not always be reliable, as keys are internal implementation details. - **Fetch props:** Once you have a reference to the Fragment component, you can access its props using the standard JavaScript property access syntax (e.g., `component.props.propName`).
- Background script and devtools extension:
- Background scripts run in the background process of your extension and can communicate with webpages and other scripts using message passing.
- Devtools extensions leverage the Chrome DevTools Extension API to interact with the browser's developer tools.
Here's how you can use this approach:
- **Background script communication:** The devtools panel can communicate with a background script using the `runtime.onMessage` API. - **Content script injection:** The background script can inject a content script similar to the first approach, but with the key information received from the devtools panel. - **Data retrieval and response:** The content script retrieves the Fragment's props and sends them back to the background script using message passing. - **Display data in devtools panel:** The background script receives the data from the content script and updates the devtools panel UI to display the retrieved props.
Important considerations:
- Content scripts can be limited in their ability to access and modify certain parts of the webpage due to security restrictions.
- Directly querying React components by key is not the most reliable approach as keys might not be unique or publicly exposed.
- Consider using a library like
react-devtools
(if allowed by Chrome extension guidelines) to interact with the React component tree in a more robust way. Remember that modifying the DOM or interacting with webpages requires careful consideration of security implications and potential conflicts with other extensions. Make sure your extension adheres to Chrome extension development guidelines.
The structure was changed a bit: still no luck :(
I see that when loading - the page does a GraphQL Query:
This is for profile-171591352-1500891
and requires a Bearer Token
This could be a potential to use it - but we need to extract the BearerToken from the page somehow
Yeah, if you can use the background script, it's not too hard to call listener functions in that and wait for the response.
Yeah, if you can use the background script, it's not too hard to call listener functions in that and wait for the response.
I'd love to see an example
Still work in progress.... DO NOT MERGE YET
Solves #110