Closed UserWangZz closed 2 months ago
def convert_token(html_list):
"""
Convert raw html to label format
"""
token_list = ["<tbody>"]
# final html list:
for row in html_list:
token_list.append("<tr>")
for col in row:
if col == None:
continue
elif col == "td":
token_list.extend(["<td>", "</td>"])
else:
token_list.append("<td")
if "colspan" in col:
_, n = col.split("colspan=")
token_list.append(' colspan="{}"'.format(str(int(n[0]))))
if "rowspan" in col:
_, n = col.split("rowspan=")
token_list.append(' rowspan="{}"'.format(str(int(n[0]))))
token_list.extend([">", "</td>"])
token_list.append("</tr>")
token_list.append("</tbody>")
return token_list
input_html_list = [
["td", "rowspan=31"],
["td", "td"],
["td", None]
]
print(convert_token(input_html_list))
这里两种修改都是
['<tbody>', '<tr>', '<td>', '</td>', '<td', ' rowspan="3"', '>', '</td>', '</tr>', '<tr>', '<td>', '</td>', '<td>', '</td>', '</tr>', '<tr>', '<td>', '</td>', '</tr>', '</tbody>']
PPOCRLabel/libs/utils.py中的convert_token方法中 col.split()返回值n为str类型,在下方token_list.append中,只对n的第一位进行format