During link traversal the same resource seems to be re-requested many times. In particular, this happens if I run the following script:
// import { QueryEngine } from '@comunica/query-sparql';
import { getOwlClasses, getProperties, predictAllClasses } from './utils';
import { QueryEngine } from '@comunica/query-sparql-link-traversal';
import { DataFactory as DF } from 'n3';
/**
 * Reproduction script: runs a SPARQL property-path query with the
 * link-traversal query engine, starting from the FOAF spec document, and
 * prints the `?type` binding of every result.
 *
 * Every outgoing request goes through a custom `httpProxyHandler` that probes
 * the target with a HEAD request first. When the probe does not answer 200,
 * the same URL is probed through the Wayback Machine; when that answers 200
 * the request is rewritten to the archived copy.
 */
async function main() {
  const engine = new QueryEngine();
  const WAYBACK_URL = 'http://wayback.archive-it.org/';

  /**
   * Renders a fetch input as a URL string for logging.
   * `Request#toString()` yields "[object Request]", so Request objects are
   * unwrapped via `.url`; strings (and URL objects) stringify to the URL itself.
   */
  const describeInput = (input: RequestInfo | URL): string =>
    input instanceof Request ? input.url : String(input);

  /**
   * Builds a copy of the given request that targets the Wayback Machine.
   *
   * @param action - Original request descriptor (fetch input plus optional init).
   * @returns A descriptor whose `input` is a Request for
   *   `<WAYBACK_URL>/<original url>`, carrying over the original request's settings.
   */
  function addWayback(action: { input: RequestInfo; init?: RequestInit }): { input: Request } {
    // Normalise to a Request first so `.url` is available whatever form `input` had.
    const request = new Request(action.input, action.init);
    return {
      input: new Request(new URL(`/${request.url}`, WAYBACK_URL), request),
    };
  }

  const results = await engine.queryBindings(`
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?type WHERE {
foaf:Group (rdfs:subClassOf*/^rdfs:domain/rdfs:range)* ?type
}`,
    {
      sources: [
        "http://xmlns.com/foaf/spec/20140114.rdf"
      ],
      lenient: true,
      httpProxyHandler: {
        async getProxy(request) {
          // Probe the live resource first.
          const res = await fetch(request.input, { method: 'HEAD' });
          console.log(res.status)
          if (res.status !== 200) {
            // Live resource is not available: probe the archived copy instead.
            const newRequest = addWayback(request);
            // Distinct name so the live response above is not shadowed.
            const archivedRes = await fetch(newRequest.input, { method: 'HEAD' });
            console.log(archivedRes.status, archivedRes.url)
            if (archivedRes.status === 200) {
              // `newRequest.input` is a Request; log its URL, not "[object Request]".
              console.log('new request', describeInput(newRequest.input))
              return newRequest;
            }
          }
          // Either the live resource answered 200 or the archive did not: keep the original request.
          console.log('res', describeInput(request.input), res.url)
          return request;
        }
      }
    }
  )
  const arr = await results.toArray();
  console.log(arr.map(r => r.get('type')))
}
main();
Then as a response I get the likes of
res http://xmlns.com/foaf/0.1/interest http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/icqChatID http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/topic_interest http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/sha1 http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/workInfoHomepage http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/currentProject http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/Document http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/msnChatID http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/logo http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/theme http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/img http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/publications http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/knows http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/page http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/topic http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/pastProject http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/Agent http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/name http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/fundedBy http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/workplaceHomepage http://xmlns.com/foaf/0.1/
200
res http://xmlns.com/foaf/0.1/aimChatID http://xmlns.com/foaf/0.1/
It seems to me that it would be fairly safe in this instance to see that the response URL for http://xmlns.com/foaf/0.1/interest is http://xmlns.com/foaf/0.1/ and then use that cached resource for all future URLs starting with http://xmlns.com/foaf/0.1/ (or, if you want to be safe, to do a HEAD request only on future URLs to confirm they redirect to the cached page). I think it would also be safe to have greater parallelisation on the HEAD requests compared to GET requests.
Issue type:
Description:
During link traversal the same resource seems to be re-requested many times. In particular, this happens if I run the following script:
Then as a response I get the likes of
It seems to me that it would be fairly safe in this instance to see that the response URL for
http://xmlns.com/foaf/0.1/interest
is http://xmlns.com/foaf/0.1/
and then use that cached resource for all future URLs starting with http://xmlns.com/foaf/0.1/
(or, if you want to be safe, to do a HEAD request only on future URLs to confirm they redirect to the cached page). I think it would also be safe to have greater parallelisation on the HEAD requests compared to GET requests.
Environment: