Sorry for the late reply.
These are the three code snippets I wrote earlier. Run them in order; I hope they will be useful to you!
@huhui, @arunbaruah, @nagsubhadeep, @Magical66
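In short: snippet 1 cleans each log template into a word list and saves the fastText vector of every word it uses; snippet 2 computes smooth-idf weights over the training sequences; snippet 3 combines TF and IDF to turn every event into a single 300-dimensional semantic vector.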
1.
```python
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 10:54:57 2019
@author: lidongxu1
"""
import re
import spacy
import json

def data_read(filepath):
    fp = open(filepath, "r")
    datas = []  # holds the processed lines
    lines = fp.readlines()  # read the whole file
    for line in lines:
        row = line.strip('\n')  # strip the trailing newline
        datas.append(row)
    fp.close()
    return datas
def camel_to_snake(name):
    """
    Handles more advanced cases too (no longer reversible).
    Ref: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
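# e.g. camel_to_snake('ReceivedBlock') -> 'received_block'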
def replace_all_blank(value):
    """
    Remove everything that is not a letter (punctuation, spaces,
    newlines, underscores, digits, ...) and replace it with a space.
    :param value: the string to clean
    :return: the cleaned string
    # https://juejin.im/post/5d50c132f265da03de3af40b
    # \W matches any character that is not a letter, digit or underscore
    """
    result = re.sub(r'\W+', ' ', value).replace("_", ' ')
    result = re.sub(r'\d', ' ', result)
    return result
# https://github.com/explosion/spaCy
# https://github.com/hamelsmu/Seq2Seq_Tutorial/issues/1
nlp = spacy.load('en_core_web_sm')

def lemmatize_stop(text):
    """
    Tokenize with spaCy and drop stop words (lemmatization is left
    commented out below).
    https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
    """
    document = nlp(text)
    # lemmas = [token.lemma_ for token in document if not token.is_stop]
    lemmas = [token.text for token in document if not token.is_stop]
    return lemmas
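# Putting the steps together, a raw template line such as
# 'Deleting_block blk_123' should come out as ['deleting', 'block', 'blk']
# (the exact tokens depend on the spaCy model's stop-word list).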
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to dump
    :param target_path: destination path of the json file
    :return:
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    file = open(target_path, 'w', encoding='utf-8')
    file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))
    file.close()
data = data_read('template.txt')
result = {}
for i in range(len(data)):
    temp = data[i]
    temp = camel_to_snake(temp)
    temp = replace_all_blank(temp)
    temp = " ".join(temp.split())
    temp = lemmatize_stop(temp)
    result[i] = temp
print(result)
dump_2_json(result, 'eventid2template.json')

# save only the fastText word vectors that are actually needed
template_set = set()
for key in result.keys():
    for word in result[key]:
        template_set.add(word)
import io
from tqdm import tqdm

# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())  # header: vocabulary size, dimension
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        # store as a list, not a one-shot map object, so the vector can be reused
        data[tokens[0]] = list(map(float, tokens[1:]))
    fin.close()
    return data

fasttext = load_vectors('cc.en.300.vec')
template_fasttext_map = {}
for word in template_set:
    template_fasttext_map[word] = fasttext[word]
dump_2_json(template_fasttext_map, 'fasttext_map.json')
```
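Script 1 expects `template.txt` (one log template per line; a template's event id is its 0-based line number) plus the pre-trained `cc.en.300.vec` file from the fastText website, and writes `eventid2template.json` and `fasttext_map.json`. One caveat: `fasttext[word]` raises a `KeyError` for any cleaned template word that is missing from the fastText vocabulary.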
2.
```python
import os
import json
import numpy as np
import pandas as pd
from collections import Counter
import math

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict

eventid2template = read_json('eventid2template.json')
fasttext_map = read_json('fasttext_map.json')
print(eventid2template)
dataset = list()
with open('data/' + 'deepLog_hdfs_train.txt', 'r') as f:
    for line in f.readlines():
        # event ids in the file are 1-based; shift them to 0-based
        line = tuple(map(lambda n: n - 1, map(int, line.strip().split())))
        dataset.append(line)
print(len(dataset))

# one "document" per event occurrence: the word list of its template
idf_matrix = list()
for seq in dataset:
    for event in seq:
        idf_matrix.append(eventid2template[str(event)])
print(len(idf_matrix))
idf_matrix = np.array(idf_matrix, dtype=object)  # templates have different lengths
X_counts = []
for i in range(idf_matrix.shape[0]):
    word_counts = Counter(idf_matrix[i])
    X_counts.append(word_counts)
print(X_counts[1000])
X_df = pd.DataFrame(X_counts)
X_df = X_df.fillna(0)
print(len(X_df))
print(X_df.head())
events = X_df.columns
print(events)
X = X_df.values
num_instance, num_event = X.shape
print('tf-idf here')
df_vec = np.sum(X > 0, axis=0)  # document frequency of every word
print(df_vec)
print('*' * 20)
print(num_instance)
# smooth idf like sklearn
idf_vec = np.log((num_instance + 1) / (df_vec + 1)) + 1
print(idf_vec)
idf_matrix = X * np.tile(idf_vec, (num_instance, 1))
X_new = idf_matrix
print(X_new.shape)
print(X_new[1000])
word2idf = dict()
for i, j in zip(events, idf_vec):
    word2idf[i] = j
# smooth idf for out-of-vocabulary words (29 acts as their document frequency)
word2idf['oov'] = (math.log((num_instance + 1) / (29 + 1)) + 1)
print(word2idf)
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to dump
    :param target_path: destination path of the json file
    :return:
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    file = open(target_path, 'w', encoding='utf-8')
    file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))
    file.close()

dump_2_json(word2idf, 'word2idf.json')
```
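The idf here is the same "smooth idf" sklearn uses: `idf(t) = ln((1 + n) / (1 + df(t))) + 1`, where `n` is the total number of event occurrences in the training sequences and `df(t)` is the number of those occurrences whose template contains the word `t`. The `'oov'` entry gives unseen words a fixed `df` of 29.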
3.
```python
import json
import numpy as np
from collections import Counter

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict

event2template = read_json('eventid2template.json')
fasttext = read_json('fasttext_map.json')
word2idf = read_json('word2idf.json')

event2semantic_vec = dict()
# compute the TF of each template, then build its sentence vector
# as the TF-IDF-weighted average of its fastText word vectors
for event in event2template.keys():
    template = event2template[event]
    tem_len = len(template)
    count = dict(Counter(template))
    for word in count.keys():
        # TF: relative frequency of the word within this template
        TF = count[word] / tem_len
        # IDF: fall back to the 'oov' value for words without an idf
        IDF = word2idf.get(word, word2idf['oov'])
        count[word] = TF * IDF
    # normalise the TF-IDF weights so they sum to 1
    value_sum = sum(count.values())
    for word in count.keys():
        count[word] = count[word] / value_sum
    # weighted average of the 300-d fastText vectors
    semantic_vec = np.zeros(300)
    for word in count.keys():
        fasttext_weight = np.array(fasttext[word])
        semantic_vec += count[word] * fasttext_weight
    event2semantic_vec[event] = list(semantic_vec)
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: dict to dump
    :param target_path: destination path of the json file
    :return:
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)
    file = open(target_path, 'w', encoding='utf-8')
    file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))
    file.close()

dump_2_json(event2semantic_vec, 'event2semantic_vec_sameoov.json')
```
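`event2semantic_vec_sameoov.json` then maps every event id to a 300-dimensional vector `v_e = sum_w weight(w) * fastText(w)`, i.e. the TF-IDF-weighted average of the template's word vectors, which you can use wherever the model expects the per-event semantic vectors.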
Originally posted by @donglee-afar in https://github.com/donglee-afar/logdeep/issues/3#issuecomment-750028771