Closed kenoboss closed 6 years ago
import csv
# Directory that holds the corpus; adjust it to your local checkout.
corpus_path = "/home/kenobi/Repos/GitHub/TextSummarization/data/"
# Name of the raw input file inside that directory.
corpus_file = "news_summary.csv"
# Full input path; corpus_path already ends with a path separator.
corpus_complete_path = "".join((corpus_path, corpus_file))
def get_lines(corpus_complete_path):
    """Read the CSV file at ``corpus_complete_path`` and return all its rows.

    The file is decoded as windows-1250 and parsed with ``,`` as the
    delimiter and ``"`` as the quote character.

    Args:
        corpus_complete_path: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record (the first record included);
        each entry is the list of field strings of that record.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' lets the csv module deal with line breaks inside quoted
    # fields. The context manager closes the file even if parsing raises.
    with open(corpus_complete_path, newline='', encoding="windows-1250") as target:
        return list(csv.reader(target, delimiter=',', quotechar='"'))
def clear_csv_file():
    """Return the cleaned data rows of the corpus file.

    Reads every row of ``corpus_complete_path`` via ``get_lines()``, leaves
    out the first row and passes each remaining row through ``clear_line()``.

    Returns:
        A list of cleaned rows, in file order.
    """
    rows = get_lines(corpus_complete_path)
    # Row 0 is skipped; presumably it is the header row of the CSV.
    return [clear_line(row) for row in rows[1:]]
def clear_line(line):
    """Return a new list with ``clear_string()`` applied to each entry of ``line``.

    Args:
        line: Iterable of field strings belonging to one record.

    Returns:
        A list of the cleaned fields, in the same order.
    """
    return [clear_string(field) for field in line]
def clear_string(string):
    """Normalise one field and wrap it in double quotes.

    Steps, in order: strip surrounding whitespace, strip surrounding double
    quotes, strip surrounding single quotes, turn every remaining double
    quote into a single quote, remove line breaks, and finally enclose the
    result in double quotes.

    Args:
        string: The raw field text.

    Returns:
        The cleaned text enclosed in ``"``; an empty input yields ``""``
        (two quote characters).
    """
    string = string.strip()
    string = string.strip("\"")
    string = string.strip("\'")
    # Inner double quotes would clash with the quotes added below.
    string = string.replace("\"", "\'")
    # Remove every line-break variant, not only CRLF: a lone "\n" or "\r"
    # left inside a field would split the record in line-per-record output.
    string = string.replace("\r\n", "").replace("\n", "").replace("\r", "")
    return "\"" + string + "\""
def line_to_s(line):
    """Serialise one record as a single ``;``-delimited text line.

    Every entry is followed by ``;`` (so the line also ends with a trailing
    ``;``) and the line is terminated with ``\\n``.

    Args:
        line: Iterable of strings forming one record.

    Returns:
        The serialised line; an empty record yields just ``"\\n"``.
    """
    # join() builds the string in one pass instead of repeated concatenation.
    return "".join(entry + ";" for entry in line) + "\n"
def create_clear_file():
    """Write the cleaned corpus rows to ``summary_news_clear.csv``.

    The output file is placed in ``corpus_path``. It starts with a fixed
    header row, followed by every row returned by ``clear_csv_file()``; each
    row is serialised with ``line_to_s()``.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    lines = clear_csv_file()
    filepath = corpus_path + "summary_news_clear.csv"
    headline = ["author", "date", "headlines", "url", "summary", "text"]
    # An explicit encoding keeps the output independent of the platform
    # default; the context manager closes the file even if a write fails.
    with open(filepath, "w", encoding="utf-8") as target:
        target.write(line_to_s(headline))
        target.writelines(line_to_s(line) for line in lines)
def main():
    """Script entry point: delegates to ``create_clear_file()``."""
    create_clear_file()


# Run only when executed as a script, so that importing this module does not
# trigger any file I/O.
if __name__ == "__main__":
    main()
Hier ein Skript zum Aufbereiten der Datei news_summary.csv.
Es muss lediglich der Pfad zur Datei angepasst werden.
https://github.com/sunnysai12345/News_Summary
http://labs.priberam.com/Resources/PCSC.aspx
https://github.com/WING-NUS/scisumm-corpus