pistruiatul / hartapoliticii

Harta Politicii din România
http://hartapoliticii.ro
Other
62 stars 21 forks source link

Detectarea migrărilor politice din știri #31

Open okvivi opened 12 years ago

okvivi commented 12 years ago

Deja harta politicii parsează și indexează știrile politice din Mediafax și Hotnews.

Codul este aici: https://github.com/pistruiatul/hartapoliticii/tree/master/python/src/ro/vivi/news_parser și, de obicei, este rulat prin scriptul acesta: https://github.com/pistruiatul/hartapoliticii/blob/master/tools/news_parser.sh

Ar fi foarte interesant dacă am adăuga la acest script și o funcție de detectare dacă știrea menționează că un politician a migrat între partide.

Asta implică:

Ar fi cool dacă scriptul de mai sus ar fi adus într-un stadiu în care să scrie în output, cu o anumită certitudine, faptul că o știre ar putea fi despre migrare politică. Plecând de la asta am putea pune aceste știri într-un queue de moderare.

Vivi.

niflostancu commented 12 years ago

incomplet

diff --git a/www/hp-scripts/extract_party_migrations.php b/www/hp-scripts/extract_party_migrations.php
new file mode 100644
index 0000000..4548b2b
--- /dev/null
+++ b/www/hp-scripts/extract_party_migrations.php
@@ -0,0 +1,215 @@
+<?php
+define('ROOT', dirname(dirname(__FILE__)).'/');
+require(ROOT."_top.php");
+
+error_reporting(E_ALL & ~E_NOTICE & ~E_STRICT & ~E_DEPRECATED);
+
+include_once(ROOT.'hp-includes/wiki_edits.php');
+include_once(ROOT.'hp-includes/people_lib.php');
+include_once(ROOT.'hp-includes/people_util.php');
+
+// Reference data, loaded once at script start. The search helpers in this
+// file read both variables through `global`: $politicians is iterated and each
+// entry is asked isSubsetOf(); each $parties entry is read via its 'name' and
+// 'long_name' keys.
+$politicians = loadPeopleFromDb();
+$parties = loadPartiesFromDb();
+
+/**
+ * Looks up a politician by name in the globally loaded people list.
+ *
+ * Each entry of the global $politicians list is asked, in list order, whether
+ * it matches through its isSubsetOf() method; the first entry that answers
+ * with a truthy value is returned.
+ *
+ * @param  string $name The name to look for (may contain spaces).
+ * @return Person|null The first matching person object, or null when $name is
+ *                     empty or no entry matches.
+ */
+function searchForPolitician($name) {
+   global $politicians;
+
+   $found = null;
+   if (!empty($name)) {
+       foreach ($politicians as $candidate) {
+           if ($candidate->isSubsetOf($name)) {
+               $found = $candidate;
+               break;
+           }
+       }
+   }
+   return $found;
+}
+
+/**
+ * Searches the globally loaded parties list for a party matching the given name.
+ *
+ * A party matches when either
+ *  - enough significant words of $name (longer than 3 bytes) also occur in the
+ *    party's 'long_name': more than two of them, or every distinct word of
+ *    $name; or
+ *  - any word of $name equals the party's abbreviation (its 'name' field).
+ * Both comparisons are case-insensitive. Parties are tried in list order and
+ * the first one that matches is returned.
+ *
+ * @param  string $name The party name to search (short or long).
+ * @return array|null The matching party record, or null when $name is empty,
+ *                    contains no words, or no party matches.
+ */
+function searchForParty($name) {
+   global $parties;
+   if (empty($name))
+       return null;
+
+   // NOTE: strtolower() folds ASCII letters only on PHP 8.2+ (and is
+   // locale-dependent before that), so words that differ only in the case of
+   // a letter with diacritics will not be treated as equal.
+   $search_components = array_map('trim', preg_split('/[ _,.-]+/', strtolower($name)));
+   // drop empty pieces and repeated words so a word is never counted twice
+   $search_components = array_unique(array_filter($search_components, 'strlen'));
+   if (empty($search_components))
+       return null;
+
+   foreach ($parties as $party) {
+       // lowercase the long name as well, otherwise capitalized words never match
+       $party_components = array_map('trim', preg_split('/[ _,.-]+/', strtolower($party['long_name'])));
+
+       // check full name match; short words (articles, prepositions) are ignored
+       $matches = 0;
+       foreach ($search_components as $word) {
+           if (strlen($word) > 3 && in_array($word, $party_components, true))
+               $matches++;
+       }
+       if ($matches > 2 || $matches >= count($search_components))
+           return $party;
+
+       // check if any searched word is the party's abbreviation
+       if (in_array(strtolower(trim($party['name'])), $search_components, true))
+           return $party;
+   }
+   return null;
+}
+
+/**
+ * Returns every contiguous, non-empty sub-sequence of the given array.
+ *
+ * Sub-sequences are ordered by start position and, for the same start, from
+ * shortest to longest. For array('a', 'b') the result is
+ * array(array('a'), array('a', 'b'), array('b')).
+ *
+ * @param  array $array The source array (a list; positions are what matter,
+ *                      the slices are re-indexed from 0).
+ * @return array Resulting array of arrays; empty for an empty input.
+ */
+function partitionArray($array) {
+   $result = array();
+   $n = count($array);
+   for ($i=0; $i<$n; $i++) {
+       // $j is the exclusive end offset, so it must be allowed to reach $n;
+       // otherwise every slice ending at the last element is lost.
+       for ($j=$i+1; $j<=$n; $j++) {
+           $result[] = array_slice($array, $i, $j-$i);
+       }
+   }
+   return $result;
+}
+
+/**
+ * Returns whether the string is capitalized.
+ *
+ * The check is Unicode-aware: any upper-case letter counts, including
+ * letters with diacritics (e.g. "Ș", "Ț", "Î"). The string is expected to be
+ * UTF-8; an empty string or one that is not valid UTF-8 yields false.
+ *
+ * @param  string  $str The string to check.
+ * @return boolean True if the string begins with an upper-case letter.
+ */
+function isCapitalized($str) {
+   if (empty($str))
+       return false;
+   // preg_match() returns 1 on a match, 0 on no match and false on error
+   // (e.g. malformed UTF-8 with the /u modifier); only 1 means "capitalized".
+   return preg_match('/^\p{Lu}/u', $str) === 1;
+}
+
+/**
+ * Builds a token for a run of consecutive words.
+ *
+ * Each word group produced by partitionArray() is joined with spaces and
+ * looked up both as a politician and as a party. The first group with a hit
+ * decides the token; for the same group a party hit takes precedence over a
+ * politician hit. When nothing hits, the run is returned as plain text.
+ *
+ * @param  array $curname The words to classify.
+ * @return array|null Array with the keys 'type' ('text', 'party' or
+ *                    'politician') and 'words' (the input words for 'text',
+ *                    otherwise the matched party record / person object);
+ *                    null when $curname is empty.
+ */
+function getTokenFor($curname) {
+   global $politicians, $parties;
+
+   if (empty($curname)) {
+       return null;
+   }
+
+   // default outcome: plain text carrying the words themselves
+   $kind = 'text';
+   $payload = $curname;
+
+   foreach (partitionArray($curname) as $candidateWords) {
+       $personHit = searchForPolitician(implode(' ', $candidateWords));
+       $partyHit = searchForParty(implode(' ', $candidateWords));
+
+       if ($partyHit) {
+           $kind = 'party';
+           $payload = $partyHit;
+           break;
+       }
+       if ($personHit) {
+           $kind = 'politician';
+           $payload = $personHit;
+           break;
+       }
+   }
+
+   if (empty($payload)) {
+       return null;
+   }
+   return array(
+       'type' => $kind,
+       'words' => $payload,
+   );
+}
+
+/**
+ * Splits a paragraph into tokens, as the first step of party-migration detection.
+ *
+ * The text is cut into words (separated by whitespace, with the punctuation
+ * around the whitespace and at both ends of the text removed). Consecutive
+ * words are then grouped into runs: a run of capitalized words is classified
+ * by getTokenFor() as party, politician or text, any other run becomes a
+ * 'text' token.
+ *
+ * NOTE: the migration matching itself is not implemented yet (see the TODO at
+ * the end of the body), so the token list is what is currently returned. The
+ * planned output is a list of associative arrays with the following keys:
+ * - politician_id
+ * - politician_name: politician's name
+ * - old_party_id: old party's ID
+ * - new_party_id: destination party ID (can be null if the politician left the party).
+ *
+ * @param  string $paragraph The text to extract info from (expected to be UTF-8).
+ * @return array List of tokens, each an array with the keys 'type' and
+ *               'words'; empty when the text cannot be split (e.g. it is not
+ *               valid UTF-8).
+ */
+function extractPartyMigrations($paragraph) {
+   // The /u modifier makes \p{P} see whole UTF-8 characters, so multi-byte
+   // punctuation such as the quotation marks „ ” is stripped as well.
+   $words = preg_split('/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/u', $paragraph, -1, PREG_SPLIT_NO_EMPTY);
+   if ($words === false)
+       return array();
+
+   $tokens = array();
+   $curname = array();   // words of the run being collected
+   $capital = false;     // whether the current run consists of capitalized words
+   foreach ($words as $word) {
+       $isCapital = (bool) isCapitalized($word);
+       if ($isCapital !== $capital) {
+           // run boundary: emit the finished run, then start a new one
+           if ($capital) {
+               $token = getTokenFor($curname);
+               if ($token)
+                   $tokens[] = $token;
+           } elseif (!empty($curname)) {
+               // skip the empty run that precedes a capitalized first word
+               $tokens[] = array(
+                       'type' => 'text',
+                       'words' => $curname,
+                   );
+           }
+           $curname = array();
+           $capital = $isCapital;
+       }
+       $curname[] = $word;
+   }
+
+   // process the last run the same way as the runs inside the loop
+   if ($capital) {
+       $token = getTokenFor($curname);
+       if ($token)
+           $tokens[] = $token;
+   } elseif (!empty($curname)) {
+       $tokens[] = array(
+               'type' => 'text',
+               'words' => $curname,
+           );
+   }
+
+   // TODO: we need to search through the tokens for several migration patterns, examples:
+   // <party> excludes <politician> or <politician> leaves from <party>
+   // <politicians> from <party> moves to <party> ETC.
+
+   return $tokens;
+}
+
+// Demo driver: open the sample XML file next to the web root, take the HTML
+// body of a news item and dump the tokens found in its text.
+$xml = simplexml_load_file(dirname(dirname(__FILE__)) . "/test.xml");
+if ($xml === false) {
+   // missing or malformed file: report it instead of iterating over `false`
+   echo "Could not load test.xml\n";
+} else {
+   foreach ($xml->item as $item) {
+       $doc = new DOMDocument();
+       // The content is URL-encoded HTML; the XML prolog tells the parser to
+       // read it as UTF-8. '@' suppresses the warnings loadHTML() raises.
+       @$doc->loadHTML('<?xml encoding="UTF-8">' . rawurldecode($item->news_content));
+       foreach ($doc->getElementsByTagName('p') as $node) {
+           // search for migrations
+           $paragraph = $node->textContent;
+           echo $paragraph."\n";
+           print_r(extractPartyMigrations($paragraph));
+           // NOTE(review): only the first paragraph is processed - looks like
+           // a debugging limit; remove to process every paragraph.
+           break;
+       }
+       echo '-----------------------'."\n\n";
+       // NOTE(review): only the first item is processed - looks like a
+       // debugging limit; remove to process every item.
+       break;
+   }
+}
+
+include(ROOT."_bottom.php");