yanyiwu / cppjieba

"结巴"中文分词的C++版本
MIT License
2.61k stars 690 forks source link

实现了关键词提取的textrank算法,请yanyi check一下。另外,idf的方法和textrank哪个更优? #41

Closed CasyWang closed 1 month ago

CasyWang commented 9 years ago
/*
 * TextRank.hpp
 *
 *  Created on: 2015年7月7日
 *      Author: oliverlwang
 */
#ifndef TEXTRANK_H_
#define TEXTRANK_H_

#include "UndirectWeightedGraph.hpp"

namespace CppJieba
{

class TextRank
{
public:
    TextRank() : _span(2) {};
    virtual ~TextRank(){};

    /**
     * @brief get the TopN Keywords.
     *
     * @param vector words input words
     * @param vector keywords keywords with score
     * @param int topN how many keywords do you want
     *
     * @retval
     */
    int textRank(vector<string>& words, map<string, double>& wordmap)
    {
        try
        {
            UndirectWeightedGraph graph;
            map< pair<string, string>, double> cm;

            for(size_t i = 0; i < words.size(); ++i)
            {
                /* syntactic filter */

                /* ngram, when span=2, f-measure gets best result */
                for(size_t j = i + 1; j < i + _span; ++j)
                {
                    if(j >= words.size())
                        break;
                    /* using std::pair as union key */
                    pair<string, string> key = make_pair(words[i], words[j]);
                    cm[key] += 1.0;
                }
            }

            /* add edge */
            for(map< pair<string, string>, double>::iterator it = cm.begin(); it != cm.end(); ++it)
            {
                /* do not add edge between the same vertex */
                if(it->first.first == it->first.second)
                    continue;

                graph.addEdge(it->first.first, it->first.second, it->second);
            }

            /* rank */
            graph.rank();

            wordmap.clear();
            wordmap = graph.ws;
        }
        catch(exception &e)
        {
            cerr << e.what() << endl;
            return -1;
        }
        return 0;
    }

private:
    int _span;             /* scanning span */

};

} /* namespace CppJieba */
#endif /* TEXTRANK_H_ */

/*
 * UndirectWeightedGraph.hpp
 *
 *  Created on: 2015年7月7日
 *      Author: oliverlwang
 */

#ifndef UNDIRECTWEIGHTEDGRAPH_H_
#define UNDIRECTWEIGHTEDGRAPH_H_

#include <iostream>
#include <algorithm>
#include <vector>
#include <map>
#include <set>

using namespace std;

namespace CppJieba
{

/* edge type */
struct edge_t
{
    string start;
    string end;
    double weight;
};

class UndirectWeightedGraph
{
public:
    UndirectWeightedGraph(): _dampingFactor(0.85){};
    virtual ~UndirectWeightedGraph(){};

    /**
     * @brief add an edge for the UndirectedWeighted Graph
     *
     * @param string &start
     * @param  string &end
     * @param
     *
     * @retval
     */
    void addEdge(const string &start, const string &end, double weight)
    {
        /* add an out edge for vertex start */
        edge_t _edge;
        _edge.start = start;
        _edge.end = end;
        _edge.weight = weight;

        _graph[start].push_back(_edge);

        /* add an out edge for vertex end */
        _edge.start = end;
        _edge.end = start;

        _graph[end].push_back(_edge);
    }

    /**
     * @brief rank the words according to its score
     *
     * @param none
     *
     * @retval none
     */
    void rank()
    {
        map<string, double> outSum;

        /* initialize words score */
        double wsdef = (_graph.size() > 0) ? (1.0 / _graph.size()) : 1.0;

        for(map<string, vector<edge_t> >::iterator it = _graph.begin(); it != _graph.end(); ++it)
        {
            ws[it->first] = wsdef;
            outSum[it->first] = weightOutSum(it->second);
        }

        /* iterator 10 times */
        for(int i = 0; i < 10; ++i)
        {
            /* stl map is sorted by key by default */
            for(map<string, vector<edge_t> >::iterator i = _graph.begin(); i != _graph.end(); ++i)
            {
                double score = 0.0;
                for(vector<edge_t>::iterator j = i->second.begin(); j != i->second.end(); ++j)
                {
                    score += j->weight / outSum[j->end] * ws[j->end];
                }

                ws[i->first] = (1.0 - _dampingFactor) + _dampingFactor * score;
            }
        }

        /* normalize */
        double max_rank = max_element(ws.begin(), ws.end())->second;
        double min_rank = min_element(ws.begin(), ws.end())->second;

        for(map<string, double>::iterator m = ws.begin(); m != ws.end(); ++m)
        {
            m->second = (m->second - min_rank / 10.0) / (max_rank - min_rank / 10.0);
        }
    }

public:
    /* words score */
    map<string, double> ws;

private:
    /* Graph, key: vertex which is a words or term, value: In(vertex) */
      map<string, vector<edge_t> > _graph;

    /* d is the damping factor that can be set between 0 and 1, always set to 0.85 */
    double _dampingFactor;

private:
    /* calculate the weight sum of out edge */
    double weightOutSum(const vector<edge_t>& v)
    {
        double sum = 0.0;
        for(vector<edge_t>::const_iterator it = v.begin(); it != v.end(); ++it)
        {
            sum += it->weight;
        }
        return sum;
    }

};

}/* namespace CppJieba */

#endif /* UNDIRECTWEIGHTEDGRAPH_H_ */
yanyiwu commented 9 years ago

非常感谢! 不过如果能作为一次pull request提交是否更好一些?

w32zhong commented 8 years ago

TextRank的计算复杂度那么高,想必提出来一定是有效果上的优势的。

github-actions[bot] commented 2 months ago

This issue has not been updated for over 5 years and will be marked as stale. If the issue still exists, please comment or update the issue, otherwise it will be closed after 7 days.

github-actions[bot] commented 1 month ago

This issue has been automatically closed due to inactivity. If the issue still exists, please reopen it.