Open okvivi opened 12 years ago
incomplet
diff --git a/www/hp-scripts/extract_party_migrations.php b/www/hp-scripts/extract_party_migrations.php
new file mode 100644
index 0000000..4548b2b
--- /dev/null
+++ b/www/hp-scripts/extract_party_migrations.php
@@ -0,0 +1,215 @@
+<?php
+define('ROOT', dirname(dirname(__FILE__)).'/');
+require(ROOT."_top.php");
+
+error_reporting(E_ALL & ~E_NOTICE & ~E_STRICT & ~E_DEPRECATED);
+
+include_once(ROOT.'hp-includes/wiki_edits.php');
+include_once(ROOT.'hp-includes/people_lib.php');
+include_once(ROOT.'hp-includes/people_util.php');
+
+$politicians = loadPeopleFromDb();
+$parties = loadPartiesFromDb();
+
+/**
+ * Looks up a politician by name in the globally loaded $politicians list and
+ * returns the first entry whose isSubsetOf($name) check succeeds.
+ *
+ * @param string $name The name to look for (can contain spaces).
+ * @return Person|null The first matching person object, or null if there is none.
+ */
+function searchForPolitician($name) {
+    global $politicians;
+    if (!empty($name)) {
+        foreach ($politicians as $candidate) {
+            if ($candidate->isSubsetOf($name)) {
+                return $candidate;
+            }
+        }
+    }
+    return null;
+}
+
+/**
+ * Searches the parties database for the specified name and, if found, returns
+ * the matching party's database record. Matching is case-insensitive.
+ * A party matches when a search word equals its abbreviation ('name'), or when
+ * 'long_name' shares more than two 4+ byte words, or all words, with the search.
+ *
+ * @param string $name The party name to search (short or long).
+ * @return array|null The matched party record, or null when nothing matches.
+ */
+function searchForParty($name) {
+    global $parties;
+    if (empty($name))
+        return null;
+    $search_components = preg_split('/[ _,.-]+/', strtolower($name), -1, PREG_SPLIT_NO_EMPTY);
+    foreach ($parties as $party) {
+        // Lower-case the long name as well: the search words already are.
+        $party_components = preg_split('/[ _,.-]+/', strtolower($party['long_name']));
+        $matches = 0;
+        foreach ($party_components as $c1) {
+            foreach ($search_components as $c2) {
+                if (trim($c1) === trim($c2) && strlen(trim($c1)) > 3)
+                    $matches++;
+            }
+        }
+        // Use the word list here: count() of the $name string throws on PHP 8.
+        if ($matches > 2 || ($matches > 0 && $matches >= count($search_components)))
+            return $party;
+        // check if the name matches the party's abbreviation
+        foreach ($search_components as $c)
+            if (trim($c) === strtolower(trim($party['name'])))
+                return $party;
+    }
+    return null;
+}
+
+/**
+ * Returns every contiguous, non-empty run of elements of the array, ordered by
+ * start index and then by length: array(a, b) gives (a), (a, b), (b).
+ * The array must be numerically indexed!
+ * @param array $array The source array.
+ * @return array Resulting array of arrays.
+ */
+function partitionArray($array) {
+    $result = array();
+    $n = count($array);
+    for ($i = 0; $i < $n; $i++) {
+        // $j is the exclusive end index; "<=" lets runs reach the last element.
+        for ($j = $i + 1; $j <= $n; $j++)
+            $result[] = array_slice($array, $i, $j - $i);
+    }
+    return $result;
+}
+
+/**
+ * Returns whether the string is capitalized. Any Unicode upper-case letter
+ * counts (e.g. "Ș", "Î"); ASCII A-Z is still honoured for non-UTF-8 input.
+ * @param string $str The string to check.
+ * @return boolean True if the string begins with an upper-case character.
+ */
+function isCapitalized($str) {
+    return !empty($str)
+        && (preg_match('/^\p{Lu}/u', $str) === 1 || preg_match('/^[A-Z]/', $str) === 1);
+}
+
+/**
+ * Builds a single token from a run of words.
+ *
+ * Every word set produced by partitionArray() is joined with spaces and looked
+ * up both as a politician and as a party. The first set with a hit decides the
+ * token, a party hit taking precedence over a politician hit for that set.
+ * Without any hit the token is plain text carrying the words themselves.
+ *
+ * @param array $curname The words to parse.
+ * @return array|null Array with the keys 'type' ('text', 'party' or 'politician')
+ *     and 'words' (the input words, or the matched party/politician record);
+ *     null when $curname is empty.
+ */
+function getTokenFor($curname) {
+    if (empty($curname)) {
+        return null;
+    }
+
+    // Default: plain text made of the words themselves.
+    $token = array('type' => 'text', 'words' => $curname);
+    foreach (partitionArray($curname) as $wordSet) {
+        // Both lookups run for every word set, politician first.
+        $phrase = implode(' ', $wordSet);
+        $politician = searchForPolitician($phrase);
+        $party = searchForParty($phrase);
+
+        // A party hit wins over a politician hit.
+        if ($party) {
+            $token = array('type' => 'party', 'words' => $party);
+            break;
+        }
+        if ($politician) {
+            $token = array('type' => 'politician', 'words' => $politician);
+            break;
+        }
+    }
+
+    // Only hand back a token that actually carries data.
+    $hasData = !empty($token['words']);
+    return $hasData ? $token : null;
+}
+
+/**
+ * Parses the specified text in search of politicians that had party migrations.
+ * Migration detection is not implemented yet (see the TODO below): for now the
+ * function returns the tokens built from the paragraph. Each token is an array
+ * with the keys:
+ * - type: 'text', 'party' or 'politician'
+ * - words: the words of the run, or the matched party/politician record.
+ * The planned migration items are meant to carry the following keys:
+ * - politician_id
+ * - politician_name: politician's name
+ * - old_party_id: old party's ID
+ * - new_party_id: destination party ID (can be null if the politician left the party).
+ *
+ * @param string $paragraph The text to extract info from.
+ * @return array The tokens found in the paragraph, in reading order.
+ */
+function extractPartyMigrations($paragraph) {
+    // Split into words, dropping punctuation next to whitespace or the text edges.
+    // The "u" modifier makes \p{P} see UTF-8 characters instead of single bytes.
+    $pattern = '/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/';
+    $words = preg_split($pattern . 'u', $paragraph, -1, PREG_SPLIT_NO_EMPTY);
+    if ($words === false)
+        $words = preg_split($pattern, $paragraph, -1, PREG_SPLIT_NO_EMPTY); // e.g. invalid UTF-8
+
+    $tokens = array();
+    $run = array();   // words of the run being collected
+    $capital = false; // whether $run consists of capitalized words
+    foreach ($words as $word) {
+        $isCapital = (bool) isCapitalized($word);
+        if ($isCapital != $capital && !empty($run)) {
+            // Capitalization flips: close the current run and start a new one.
+            // Only capitalized runs are looked up as politician/party names.
+            $token = $capital
+                ? getTokenFor($run)
+                : array('type' => 'text', 'words' => $run);
+            if ($token) {
+                $tokens[] = $token;
+            }
+            $run = array();
+        }
+        $capital = $isCapital;
+        $run[] = $word;
+    }
+    // process the last run the same way
+    if (!empty($run)) {
+        $token = $capital
+            ? getTokenFor($run)
+            : array('type' => 'text', 'words' => $run);
+        if ($token) {
+            $tokens[] = $token;
+        }
+    }
+
+    // TODO: we need to search through the tokens for several migration patterns, examples:
+    // <party> excludes <politician> or <politician> leaves from <party>
+    // <politicians> from <party> moves to <party> ETC.
+
+    return $tokens;
+}
+
+// Open and parse the XML file, printing the tokens of every paragraph of every item.
+$xml = simplexml_load_file(dirname(dirname(__FILE__)) . "/test.xml");
+$items = ($xml === false) ? array() : $xml->item; // nothing to process if loading failed
+foreach ($items as $item) {
+    $doc = new DOMDocument();
+    // Read the current $item; "@" silences the parser warnings raised by loadHTML().
+    @$doc->loadHTML('<?xml encoding="UTF-8">' . rawurldecode((string) $item->news_content));
+    foreach ($doc->getElementsByTagName('p') as $node) {
+        // search for migrations
+        $paragraph = $node->textContent;
+        echo $paragraph."\n";
+        print_r(extractPartyMigrations($paragraph));
+    }
+    echo '-----------------------'."\n\n";
+}
+
+include(ROOT."_bottom.php");
Deja harta politicii parsează și indexează știrile politice din Mediafax și Hotnews.
Codul este aici: https://github.com/pistruiatul/hartapoliticii/tree/master/python/src/ro/vivi/news_parser și de obicei este rulat prin scriptul acesta: https://github.com/pistruiatul/hartapoliticii/blob/master/tools/news_parser.sh
Ar fi foarte interesant dacă am adăuga la acest script și o funcție de detectare dacă știrea menționează că un politician a migrat între partide.
Asta implică:
Ar fi cool dacă scriptul de mai sus ar fi adus într-un stadiu în care să scrie în output, cu o anumită certitudine, faptul că o știre ar putea fi despre migrare politică. Plecând de la asta am putea pune aceste știri într-un queue de moderare.
Vivi.