I don't think this is a bug, but I am trying to figure out how to do the following - I hope you will consider it a milestone to reach for this great parser, which saves a lot of time compared with importing the SQL dump into MySQL.
I need to find and print a list of all the article links that belong to categories in the categorylinks table.
I tried to adapt the following code from the ListArticleToArticleLinks example in Mono, but I got errors in the variable declarations. I am new to Mono and I am trying to understand how to do it.
Could someone please help?
using System;
using System.Linq;
using WpSqlDumpParser;
using WpSqlDumpParser.EntityCollections;
using WpSqlDumpParser.IO;
using System.IO;
namespace ListArticleToArticleLinks
{
static class Program
{
/// <summary>
/// Program entry point: runs the category-links export.
/// </summary>
static void Main()
{
    // Only the category export is executed; DownloadLinks() is not called from here.
    DownloadCategories();
}
/// <summary>
/// Reads the page links of the latest "enwiki" dump, keeps the links that have a
/// resolved source page and point into namespace 14, and writes them as
/// tab-separated lines (source id, source title, target title) to
/// "articleTocategory.txt".
/// </summary>
static void DownloadLinks()
{
    // Turn on logging of dump downloads.
    DownloadStream.Log = true;

    // Directory in which downloaded dumps are cached.
    CachingStream.CachePath = @"/Users/gg4u/";

    // Restrict the loaded pages to the article namespace (ns = 0); other pages
    // are not needed here. To load namespace 14 instead, compare against
    // Namespaces.FindNamespaceById(14) in the predicate below.
    Pages.Instance.Limiter = candidates => candidates.Where(page => page.Namespace == Namespaces.Article);

    var allLinks = PageLinks.Instance.Get("enwiki", DumpsManager.GetLastDumpDate("enwiki"));

    // Keep links whose source page is known (presumably only article pages,
    // given the limiter above - confirm against the library) and whose target
    // lies in namespace 14.
    var linksIntoNamespace14 =
        from candidate in allLinks
        where candidate.From != null
              && candidate.ToNamespace == Namespaces.FindNamespaceById(14)
        select candidate;

    using (var output = new StreamWriter("articleTocategory.txt"))
    {
        foreach (var entry in linksIntoNamespace14)
        {
            output.WriteLine("{0}\t{1}\t{2}", entry.FromId, entry.From.Title, entry.ToTitle);
        }
    }
}
/// <summary>
/// Reads the category links of the latest "enwiki" dump and writes one
/// tab-separated line per link (source page id, target category, link type)
/// to "pagesBelongToCategory.txt".
/// </summary>
static void DownloadCategories()
{
    // Turn on logging of dump downloads.
    DownloadStream.Log = true;

    // Directory in which downloaded dumps are cached.
    CachingStream.CachePath = @"/Users/gg4u/";

    // No page limiter is configured in this method. To restrict the loaded
    // pages to one namespace, set Pages.Instance.Limiter first, e.g.
    //   ns = 0:  pages => pages.Where(p => p.Namespace == Namespaces.Article)
    //   ns = 14: pages => pages.Where(p => p.Namespace == Namespaces.FindNamespaceById(14))

    // The collection is reached through the CategoryLinks class (capital C),
    // in the same way DownloadLinks uses PageLinks.Instance. Referring to the
    // local variable "categoryLinks" inside its own initializer does not compile.
    // NOTE(review): assumes the library exposes CategoryLinks in
    // WpSqlDumpParser.EntityCollections and that its entries have From, FromId,
    // ToCategory and Type members - confirm against the library.
    var categoryLinks = CategoryLinks.Instance.Get("enwiki", DumpsManager.GetLastDumpDate("enwiki"));

    // Keep only the links whose source page is available.
    var pagesBelongToCategory = categoryLinks.Where(cl => cl.From != null);

    // The categorylinks table (http://www.mediawiki.org/wiki/Manual:Categorylinks_table)
    // has the columns cl_from, cl_to and cl_type, which are written out below.
    using (var writer = new StreamWriter("pagesBelongToCategory.txt"))
    {
        foreach (var link in pagesBelongToCategory)
        {
            writer.WriteLine("{0}\t{1}\t{2}", link.FromId, link.ToCategory, link.Type);
        }
    }
}
}
I don't think this is a bug, but I am trying to figure out how to do the following - I hope you will consider it a milestone to reach for this great parser, which saves a lot of time compared with importing the SQL dump into MySQL.
I need to find and print a list of all the article links that belong to categories in the categorylinks table.
I tried to adapt the following code from the ListArticleToArticleLinks example in Mono, but I got errors in the variable declarations. I am new to Mono and I am trying to understand how to do it.
Could someone please help?
using System; using System.Linq; using WpSqlDumpParser; using WpSqlDumpParser.EntityCollections; using WpSqlDumpParser.IO; using System.IO;
namespace ListArticleToArticleLinks { static class Program { static void Main()
}