svick / Wikipedia-SQL-dump-parser

Simple library to parse Wikipedia's SQL dumps without the need to import them into a database

Obtaining lists of articles belonging to categories (categoryLinks) #1

Closed: gg4u closed this issue 12 years ago

gg4u commented 12 years ago

I don't think this is really an issue, but I am trying to figure out how to do it. I hope you consider it a milestone to reach for this great parser, which is saving a lot of time compared to importing the SQL dumps into MySQL.

I need to extract and print a list of all the articles belonging to categories, from the categorylinks table.

I tried to adapt the following code from the ListArticleToArticleLinks example under Mono, but I got errors in the variable declarations. I am new to using Mono and I am trying to understand how to do it.

Could someone please help?

    using System;
    using System.Linq;
    using WpSqlDumpParser;
    using WpSqlDumpParser.EntityCollections;
    using WpSqlDumpParser.IO;
    using System.IO;

    namespace ListArticleToArticleLinks
    {
        static class Program
        {
            static void Main()
            {
                DownloadCategories();
            }

            static void DownloadLinks()
            {
                // log the download of the dumps
                DownloadStream.Log = true;

                // path where the dumps will be downloaded
                CachingStream.CachePath = @"/Users/gg4u/";

                // we won't need other pages, so there's no need to load them into memory
                // this is for ns = 0:
                Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.Article);
                // this is for ns = 14:
                // Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.FindNamespaceById(14));

                var pageLinks = PageLinks.Instance.Get("enwiki", DumpsManager.GetLastDumpDate("enwiki"));

                var articleToArticleLinks =
                    pageLinks.Where(
                        pl => pl.From != null // because of the page limiter above, this will give only links from ....
                              && pl.ToNamespace == Namespaces.FindNamespaceById(14)); // only links to ....

                using (var writer = new StreamWriter("articleTocategory.txt"))
                {
                    foreach (var link in articleToArticleLinks)
                        writer.WriteLine("{0}\t{1}\t{2}", link.FromId, link.From.Title, link.ToTitle);
                }
            }

            static void DownloadCategories()
            {
                // log the download of the dumps
                DownloadStream.Log = true;

                // path where the dumps will be downloaded
                CachingStream.CachePath = @"/Users/gg4u/";

                // we won't need other pages, so there's no need to load them into memory
                // this is for ns = 0:
                // Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.Article);
                // this is for ns = 14:
                // Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.FindNamespaceById(14));
                // no limiter should be needed for the categorylinks table

                var categoryLinks = categoryLinks.Instance.Get("enwiki", DumpsManager.GetLastDumpDate("enwiki"));

                var pagesBelongToCategory =
                    categoryLinks.Where(
                        cl => cl.From != null); // because of the page limiter above, this will give only links from ....
                        // the following condition isn't needed here:
                        // && pl.ToNamespace == Namespaces.FindNamespaceById(14)); // only links to ....

                // in the table http://www.mediawiki.org/wiki/Manual:Categorylinks_table I have: cl_from, cl_to, cl_type
                using (var writer = new StreamWriter("pagesBelongToCategory.txt"))
                {
                    foreach (var link in pagesBelongToCategory)
                        writer.WriteLine("{0}\t{1}\t{2}", link.FromId, link.ToCategory, link.Type);
                }
            }
        }
    }
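
To make the goal concrete, here is a minimal, self-contained sketch of what I am trying to end up with, assuming the library exposes a `CategoryLinks` collection analogous to `PageLinks`. That class name, and the `FromId` / `ToCategory` / `Type` property names (mirroring the cl_from, cl_to, cl_type columns), are my guesses and not something I have confirmed in the library:

    // Minimal sketch under the assumptions above: a CategoryLinks entity collection
    // exists and each link exposes FromId, ToCategory and Type.
    using System.IO;
    using WpSqlDumpParser;
    using WpSqlDumpParser.EntityCollections;
    using WpSqlDumpParser.IO;

    static class CategoryLinksSketch
    {
        static void Main()
        {
            // log the download of the dumps and cache them locally
            DownloadStream.Log = true;
            CachingStream.CachePath = @"/Users/gg4u/";

            // read the categorylinks table of the latest enwiki dump
            var categoryLinks = CategoryLinks.Instance.Get("enwiki", DumpsManager.GetLastDumpDate("enwiki"));

            // write one line per page-to-category link: cl_from, cl_to, cl_type
            using (var writer = new StreamWriter("pagesBelongToCategory.txt"))
            {
                foreach (var link in categoryLinks)
                    writer.WriteLine("{0}\t{1}\t{2}", link.FromId, link.ToCategory, link.Type);
            }
        }
    }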