In the page_inlink.txt file, some page links (e.g. "Henry_Hutchinson" ->
"Stub") are wrong.
This is because the page link processing does not take the namespace of the
link target into account (e.g. some pages link to "Wikipedia:Stub" rather than
"Stub", but the link is recorded as if it pointed to "Stub").
I suggest modifying the method:
public void processPageLinksRow(PagelinksParser plParser)
in SingleDumpVersionJDK.java
from
/**
 * Processes one row delivered by the page links parser and records the link
 * in both the outlink and the inlink tables.
 *
 * NOTE(review): only the target title (getPlTo) is used for the lookup; the
 * target's namespace is never consulted, so a link to a same-titled page in
 * another namespace (e.g. "Wikipedia:Stub") is treated as a link to "Stub".
 *
 * @param plParser parser positioned on the row to process
 * @throws IOException declared by the signature; presumably raised by the
 *         addRow calls - confirm against their declarations
 */
public void processPageLinksRow(PagelinksParser plParser)
throws IOException {
int pl_from = plParser.getPlFrom();
String pl_to = plParser.getPlTo();
// Rows without a target title are ignored.
if (pl_to != null) {
// Resolve the target title to a page id through its hash; a null result
// means the title is not present in pNamePageIdMap.
KeyType plToHash = (KeyType) hashAlgorithm.hashCode(pl_to);
Integer pl_toValue = pNamePageIdMap.get(plToHash);
// skip redirects if skipPage is enabled
// (i.e. when skipPage is set, the source page id must be a key of
// pPageIdNameMap; in all cases the target must have resolved to an id)
if ((!skipPage || pPageIdNameMap.containsKey(pl_from))
&& pl_toValue != null) {
// Record the link in both directions.
pageOutlinks.addRow(pl_from, pl_toValue);
pageInlinks.addRow(pl_toValue, pl_from);
}
}
}
to
/**
 * Handles a single page links row, recording the link in the outlink and
 * inlink tables only when its target lies in the main namespace.
 *
 * @param plParser parser positioned on the row to process
 * @throws IOException declared by the signature; presumably raised by the
 *         addRow calls - confirm against their declarations
 */
public void processPageLinksRow(PagelinksParser plParser)
        throws IOException {
    final int sourceId = plParser.getPlFrom();
    final String targetTitle = plParser.getPlTo();
    final int targetNamespace = plParser.getPlNamespace();

    // Ignore rows that have no target title or whose target is outside
    // the main namespace (e.g. "Wikipedia:Stub").
    if (targetTitle == null || targetNamespace != NS_MAIN) {
        return;
    }

    // Resolve the target title to a page id through its hash; null means
    // the title is not present in pNamePageIdMap.
    final KeyType targetKey = (KeyType) hashAlgorithm.hashCode(targetTitle);
    final Integer targetId = pNamePageIdMap.get(targetKey);

    // skip redirects if skipPage is enabled
    final boolean sourceAccepted =
            !skipPage || pPageIdNameMap.containsKey(sourceId);
    if (sourceAccepted && targetId != null) {
        // Record the link in both directions.
        pageOutlinks.addRow(sourceId, targetId);
        pageInlinks.addRow(targetId, sourceId);
    }
}
Original issue reported on code.google.com by astronau...@gmail.com on 20 Sep 2012 at 4:24
Original issue reported on code.google.com by
astronau...@gmail.com
on 20 Sep 2012 at 4:24