ckiplab / ckiptagger

CKIP Neural Chinese Word Segmentation, POS Tagging, and NER
GNU General Public License v3.0
1.64k stars 193 forks source link

AssertionError while running NER #38

Open Annrison opened 3 years ago

Annrison commented 3 years ago

您好,參考官網的範例寫了一個 get_nlp_result function,會 iterate data_df 的 row,將文字資料 row[text_col] 依序送進 ws、pos 和 ner function 處理

在跑 ner 的時候有時會遇到 AssertionError,請問這是甚麼問題造成的呢?

def _ner_with_fallback(word_sentence_list, pos_sentence_list):
    """Run ``ner`` on a batch, degrading gracefully on ``AssertionError``.

    The batch call is tried first. If it raises ``AssertionError``, every
    sentence is re-run on its own so one problematic sentence cannot abort the
    whole batch; a sentence that still fails is retried with
    ``character_normalization=False``.

    Args:
        word_sentence_list: Segmented sentences, as returned by ``ws``.
        pos_sentence_list: POS tags aligned with ``word_sentence_list``, as
            returned by ``pos``.

    Returns:
        A list with one NER result per input sentence.

    Raises:
        AssertionError: If a sentence still fails with character normalization
            disabled.
    """
    try:
        return ner(word_sentence_list, pos_sentence_list)
    except AssertionError:
        entity_sentence_list = []
        for word_sentence, pos_sentence in zip(word_sentence_list, pos_sentence_list):
            try:
                single_result = ner([word_sentence], [pos_sentence])
            except AssertionError:
                # NOTE(review): presumably the assertion is tied to character
                # normalization, which is why disabling it is the last resort —
                # confirm against the ckiptagger implementation.
                single_result = ner(
                    [word_sentence], [pos_sentence], character_normalization=False
                )
            entity_sentence_list.append(single_result[0])
        return entity_sentence_list


def get_nlp_result(data_df, id_col, text_col):
    """Run word segmentation, POS tagging and NER over every row of *data_df*.

    Each row's ``text_col`` value is split with ``Sentence_Segmentation``; the
    flattened sentences are passed through ``ws`` (with ``dictionary2`` as the
    coerce dictionary), ``pos`` and ``ner``. Per-sentence rows are built with
    ``get_pos`` and ``get_ner``. The elapsed time is printed before returning.

    Args:
        data_df: DataFrame whose rows are the documents to process.
        id_col: Label of the column holding the document identifier.
        text_col: Label of the column holding the raw text.

    Returns:
        A tuple ``(pos_table, entity_table, sentence_table)`` of DataFrames
        with columns ``[id_col, 'word', 'pos']``, ``[id_col, 'word', 'ner']``
        and ``[id_col, 'sentence']`` respectively.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    pos_rows = []
    entity_rows = []
    sentence_rows = []

    for _, row in data_df.iterrows():  # document level
        doc_id = row[id_col]

        # clean data
        segmented = Sentence_Segmentation(row[text_col])
        flat_list = [item for sublist in segmented for item in sublist]
        if not flat_list:
            # Nothing to tag for this document; avoid calling the models on
            # an empty batch.
            continue

        # ckip
        w_sentence_list = ws(flat_list, coerce_dictionary=dictionary2)  # set dictionary
        pos_sentence_list = pos(w_sentence_list)
        # A bare ner() call can raise AssertionError on some inputs and abort
        # the whole run, so go through the fallback helper instead.
        entity_sentence_list = _ner_with_fallback(w_sentence_list, pos_sentence_list)

        # Indexing (rather than zip) keeps a length mismatch between the
        # model outputs and the input sentences loud instead of truncating.
        for i, sentence in enumerate(flat_list):  # sentence level
            sentence_rows.append([doc_id, sentence])
            pos_rows.extend(get_pos(doc_id, w_sentence_list[i], pos_sentence_list[i]))
            # Extending with an empty result is a no-op, so sentences without
            # entities contribute no rows.
            entity_rows.extend(get_ner(doc_id, entity_sentence_list[i]))

    pos_table = pd.DataFrame(data=pos_rows, columns=[id_col, 'word', 'pos'])
    entity_table = pd.DataFrame(data=entity_rows, columns=[id_col, 'word', 'ner'])
    sentence_table = pd.DataFrame(data=sentence_rows, columns=[id_col, 'sentence'])

    end = time.perf_counter()
    print("time costing: {}".format(end - start))

    return pos_table, entity_table, sentence_table

image

jacobvsdanniel commented 3 years ago
# Run NER on the whole batch first; the fallback below is used only when
# that call raises AssertionError.
try:
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
except AssertionError:
    # The batch call failed: redo NER one sentence at a time so that a single
    # problematic sentence does not fail every other sentence in the batch.
    entity_sentence_list = []
    for word_sentence, pos_sentence in zip(word_sentence_list, pos_sentence_list):
        try:
            # Each sentence is wrapped in a one-element list, mirroring the
            # list-of-sentences arguments of the batch call above.
            singleton_entity_sentence_list = ner([word_sentence], [pos_sentence])
        except AssertionError:
            # Last resort for the failing sentence: retry with character
            # normalization disabled.
            # NOTE(review): presumably the assertion is tied to character
            # normalization — confirm against the ckiptagger implementation.
            singleton_entity_sentence_list = ner([word_sentence], [pos_sentence], character_normalization=False)
        # Keep only the single sentence's result from the one-element output.
        entity_sentence_list.append(singleton_entity_sentence_list[0])