A-BigTree / searchmanage_Wiki

You can set the number of threads used to query the Wikimedia API (https://www.wikidata.org/w/api.php). You can pass in an N-dimensional list of entity texts or IDs, so many entities can be queried in a single call. Moreover, each entity's result is returned at the same position and dimension as in the input list.
Apache License 2.0
9 stars 1 forks source link
multithreading python wikimedia-api wikipedia-api

目录

版本说明

V0.1

V0.2

V0.3

V0.3.5

V0.4

v0.4.1

v0.4.5

v0.4.6

v0.4.7

模块视图

classDiagram
    Entities ..> AnalysisTools:依赖
    Entities ..> Tools:依赖
    EntitiesSearch ..> Tools:依赖

    Entities --|>RequestAnalysis : 继承
    EntitiesSearch ..o Entities: 集合
    SearchManage --|> EntitiesSearch :继承
    Wikipedia --|> EntitiesSearch:继承
    SparqlQuery --|> EntitiesSearch:继承
    BingQuery --|> EntitiesSearch:继承
    SpellCheck --|> EntitiesSearch:继承
    DbpediaLookUp --|> EntitiesSearch:继承

    class Tools{
    @ staticmethod
    }

    class AnalysisTools{
    @ staticmethod
    }

    class RequestAnalysis{
    - request
    - analysis
    - ready_
    }

    class Entities{
    - index
    - entities
    - params
    }

    class EntitiesSearch{
    + entities_num
    + key
    + keys
    + m_num
    + Queue[Entities]: search_queue
    + Queue[Entities]: re_queue
    + List[Entities]: re_list
    + queryParams
    - index_
    }

    class SearchManage{
    - url
    }

    class Wikipedia{

    }

    class SparqlQuery{
    - url_
    - returnFormat
    }

    class BingQuery{
    - url_
    }

    class SpellCheck{
    - url_
    }

    class DbpediaLookUp{

    }

Quickstart

SearchManage

导入与初始化

from searchmanage import SearchManage

search_m1 = SearchManage(key='search', m_num=20)
search_m2 = SearchManage(key='ids', m_num=10)
"""
url_api(str): 查询网址,默认为"https://www.wikidata.org/w/api.php";
key(str): 查询action,'search' 或者 'ids';
m_num(int): 指定线程数目;
"""

重要方法说明

    def search_run(self, points: list, keys: Union[str, List[str]] = None,
                   timeout: float = 30.0, time_stop: float = 30.0, block_num: int = 10,
                   function_=None, args: tuple = None, **kwargs) -> dict

解析关键词规范

  1. labels,descriptions,aliases

    labels为例子

    • labels,labels/,labels//→labels下字典
    • labels/xx→xx语言下的值
  2. claims

    • claims,claims/,claims//→claims下字典
    • claims/P,claims/P/,claims/P//→属性ID列表
    • claims/P/value→所有属性下的值,值的格式为元组,(值类型主值1, ···)
    • claims/Pxx,claims/Pxx/,claims/Pxx//→Pxx下的字典
    • claims/Pxx/value→Pxx的具体值,值的格式为元组,(值类型主值1, ···)
    • claims/Pxx/qualifiers-order→Pxx限定词顺序列表
    • claims/Pxx/qualifiers,claims/Pxx/qualifiers/,claims/Pxx/qualifiers//→Pxx限定词下字典
    • claims/Pxx/references,claims/Pxx/references/,claims/Pxx/references//→Pxx引用下字典
  3. sitelinks

    • sitelinks,sitelinks/,sitelinks//→外部链接下的字典

查询参数字典

# key = 'search'
# Query parameter dictionary used when key = 'search'.
{
    'search': # text,
    'action': 'wbsearchentities',
    'format': 'json',
    'language': 'en',
    'type': 'item',
    'limit': 10,
    'strictlanguage': None,
    'continue': None,
    'props': None
}

# key = 'ids'
# Query parameter dictionary used when key = 'ids'.
{
    'ids': # entity ID,
    'action': 'wbgetentities',
    'format': 'json',
    'languages': 'en',
    'redirects': None,
    'sites': None,
    'title': None,
    'props': None,
    'languagefallback': None,
    'normalize': None,
    'sitefilter': None
}

Wikipedia

导入与初始化

from searchmanage import Wikipedia

w1 = Wikipedia(m_num = 10)
"""
m_num(int):指定运行线程数量
"""

重要方法说明

    def search_run(self, points: list, time_stop: float = 30.0, block_num: int = 10,
                   function_=None, args: tuple = None, **kwargs) -> list

SparqlQuery

导入与初始化

from searchmanage import SparqlQuery

sql = SparqlQuery(m_num = 10, format_ = 'json')
"""
m_num(int):指定运行线程数量;
format_(str):查询返回格式,默认为'json';
url_(str):查询域名节点,默认为<SparqlQuery.URL_ = "https://query.wikidata.org/sparql">;
sparql_(str): 查询sparql格式语句,默认为<SparqlQuery.SPARQL_>;
"""

重要方法说明

def search_run(self, points: list, timeout: float = 30.0, time_stop: float = 30.0,
                   block_num: int = 10, function_=None, args: tuple = None, **kwargs) -> dict

Sparql语句格式化

# Default sparql_ template. `%s` is a format placeholder
# (presumably replaced by each queried entity ID — confirm against SparqlQuery).
SparqlQuery.SPARQL_ = """
SELECT ?item ?itemLabel 
WHERE 
{
?item wdt:P31 wd:%s.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

BingQuery

导入与初始化

from searchmanage import BingQuery

b = BingQuery(m_num = 10)
"""
url_(str): 模糊搜索域名节点,默认为<BingQuery.URL_ = "https://www.bing.com/search">;
m_num(int):指定运行线程数量;
"""

重要方法说明

def search_run(self, points: list, timeout: float = 30.0, time_stop: float = 30.0,
                   block_num: int = 10, function_=None, args: tuple = None, **kwargs) -> list

SpellCheck

导入与初始化

from searchmanage import SpellCheck

sc = SpellCheck(m_num = 10)
"""
url_(str): 爬取拼写结果的域名节点,默认为<BingQuery.URL_ = "https://www.bing.com/search">;
m_num(int):指定运行线程数量;
"""

重要方法说明

def search_run(self, points: list, timeout: float = 30.0, time_stop: float = 30.0,
                   block_num: int = 10, function_=None, args: tuple = None, **kwargs) -> list

DbpediaLookUp

导入与初始化

from searchmanage import DbpediaLookUp

db = DbpediaLookUp(m_num=10)
"""
key(str):
    针对实体某部分进行搜索,可选"query","label", "comment" 
    or "category", 默认为全局搜索"query";
m_num(int):指定运行线程数量;
"""

重要方法说明

def search_run(self, points: list, patten: str = "search", is_all: bool = False, timeout: float = 30.0,
                   time_stop: float = 30.0, block_num: int = 10, function_=None, args: tuple = (), **kwargs) -> dict

查询参数字典与解析键值

# Query parameter dictionary format
PARAM_DBPEDIA_QUERY = {
    "query": None,
    "label": None,
    "comment": None,
    "category": None,
    "typeName": None,
    "maxResults": 10,
    "format": "json",
    "minRelevance": None
}
"""Parameters using in Dbpedia look up."""

# List of keys used for parsing. The first element of each tuple is 1 when the
# result is singular and 2 when the result is plural, i.e. held as a list.
DBPEDIA_KEYS = [(1, 'label'), (1, 'resource'), (2, 'typeName'), (2, 'type'), (1, 'score'),
                (1, 'refCount'), (1, 'comment'), (2, 'redirectlabel'), (2, 'category')]
"""Analysis keys using in Dbpedia look up json data."""

6种值类型解析主值

# Wikidata Value-type
# Maps each value-type name to a list of its field names ('string' maps to None).
# NOTE(review): the leading integer presumably is the number of following
# fields that make up the "main value" — confirm against the parser.
value_type = {
    'wikibase-entityid': [1, 'id', 'entity-type', 'numeric-id'],
    'globecoordinate': [2, 'latitude', 'longitude', 'precision', 'globe'],
    'time': [1, 'time', 'precision', 'before', 'after', 'timezone'],
    'string': None,
    'monolingualtext': [2, 'text', 'language'],
    'quantity': [1, 'amount', 'lowerBound', 'upperBound']
}

17种数据类型

# Wikidata Data-type
# Maps each Wikidata data-type name to the value-type name that describes its
# values; every value used here is one of the keys of `value_type`.
data_type = {
    'commonsMedia': 'string',
    'globe-coordinate': 'globecoordinate',
    'wikibase-item': 'wikibase-entityid',
    'wikibase-property': 'wikibase-entityid',
    'string': 'string',
    'monolingualtext': 'monolingualtext',
    'external-id': 'string',
    'quantity': 'quantity',
    'time': 'time',
    'url': 'string',
    'math': 'string',
    'geo-shape': 'string',
    'musical-notation': 'string',
    'tabular-data': 'string',
    'wikibase-lexeme': 'wikibase-entityid',
    'wikibase-form': 'wikibase-entityid',
    'wikibase-sense': 'wikibase-entityid'
}

Example

from searchmanage import SearchManage, Wikipedia, SparqlQuery, BingQuery, Tools, SpellCheck, DbpediaLookUp

if __name__ == "__main__":
    # Local import keeps this example self-contained.
    import os

    # Read data from csv. The path is built with os.path.join so the example
    # also runs on non-Windows systems (a hard-coded "\\" separator does not).
    csv_path = os.path.join("data", "1C9LFOKN.csv")
    d, d_t = Tools.read_csv(csv_path, is_header=True, out_data_t=True, is_print=True)
    print(len(d_t))

    # Example data: p1 is a 2-dimensional list of texts, p2 a flat list of entity IDs.
    p1 = [['SEU', 'Chain', 'English'], ['computer', 'games', 'computer game'], ['graph', 'wikipedia'],
          ['SEU', 'Chain', 'English'], ['computer', 'games', 'computer game'], ['graph', 'wikipedia'],
          ['SEU', 'Chain', 'English'], ['computer', 'games', 'computer game'], ['graph', 'wikipedia']]
    p2 = ["Q3918", "Q355304", "Q106589826", "Q3918", "Q355304", "Q106589826", "Q3918", "Q355304",
          "Q106589826", "Q3918", "Q355304", "Q106589826"]

    # SearchManage: key = 'search'
    s1 = SearchManage(key='search', m_num=10)
    r1 = s1.search_run(p1, keys='all', limit=20)
    # print(r1)

    # SearchManage: key = 'ids'
    # The IDs found by the text search above are fed into the ID lookup.
    s2 = SearchManage(key='ids', m_num=20)
    r2 = s2.search_run(r1['id'], keys=['labels/en', 'claims/P//', 'claims/P/value'])
    # print(r2)

    # Wikipedia: <wikipedia.suggest()>
    w1 = Wikipedia(m_num=10)
    r3 = w1.search_run(p1)
    # print(r3)

    # SparqlQuery: <sparql_ = SparqlQuery.SPARQL_>
    sql1 = SparqlQuery(m_num=12, format_='json')
    r4 = sql1.search_run(p2[0:3], timeout=60)
    # print(r4)

    # BingQuery: <url_ = BingQuery.URL_>
    b1 = BingQuery(m_num=24)
    r5 = b1.search_run(p1)
    print(r5)

    # Deliberately misspelled phrases used as spell-check input.
    p3 = [["elgant palm trre garden", "elgant palm trre", "the sotheast univrsity"],
          ["elgant", "elgant trre", "the sotheast univrssity"],
          ["elgat palm trre garden", "elgat palm trre", "the sothaast univrsity"]]
    # SpellCheck <url_ = "https://www.bing.com/search">
    sc = SpellCheck(m_num=12)
    r6 = sc.search_run(p3)
    print(r6)

    # DbpediaLookUp->"resource"
    db = DbpediaLookUp(m_num=10)
    r7 = db.search_run(p1, patten='search', is_all=False, maxResults=20)
    print(r7['resource'])

    # Dbpedia SPARQL: a custom endpoint and query template; the resources
    # returned by the DBpedia lookup above are passed in as the query points.
    end_point = "https://dbpedia.org/sparql"
    sparql_ = """
        SELECT?Type?Rtype
        WHERE{
        <%s> dbp:type ?Type;
             rdf:type ?Rtype.}
        """
    sql2 = SparqlQuery(m_num=200, format_='json', url_=end_point, sparql_=sparql_)
    r8 = sql2.search_run(r7['resource'], timeout=10000)
    print(r8['Type'])