ZephyrFBD / AI-powered-files-system-and-the-searching-framework

1 stars 0 forks source link

umap_Visualization_labled_all.py #2

Open ZephyrFBD opened 4 months ago

ZephyrFBD commented 4 months ago

(venv) family@nate-hp-g5:~/AI-powered-files-system-and-the-searching-framwork$ /home/family/AI-powered-files-system-and-the-searching-framwork/venv/bin/python /home/family/AI-powered-files-system-and-the-searching-framwork/umap_Visualization_labledall.py Total Progress: 42%|████████████████████████████████████████████████████ | 20/48 [00:00<00:00, 99.88it/s]/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/umap/umap.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting randomstate. Use no seed for parallelism.") /home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/umap/umap.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting randomstate. Use no seed for parallelism.") /home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/umap/umap.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting randomstate. Use no seed for parallelism.") /home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/umap/umap.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting randomstate. Use no seed for parallelism.") /home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/umap/umap.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting randomstate. 
Use no seed for parallelism.") /home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/umap/umap.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.") joblib.externals.loky.process_executor._RemoteTraceback: """ Traceback (most recent call last): File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker r = call_item() ^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py", line 291, in call return self.fn(*self.args, self.kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 598, in call return [func(*args, *kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/umap_Visualization_labled_all.py", line 38, in process_model model = Doc2Vec.load(model_path) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/gensim/models/doc2vec.py", line 809, in load return super(Doc2Vec, cls).load(args, rethrow=True, kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/gensim/models/word2vec.py", line 1953, in load model = super(Word2Vec, cls).load(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/gensim/utils.py", line 486, in load obj = unpickle(fname) ^^^^^^^^^^^^^^^ File 
"/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/gensim/utils.py", line 1461, in unpickle return _pickle.load(f, encoding='latin1') # needed because loading from S3 doesn't support readline() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ _pickle.UnpicklingError: unpickling stack underflow """

The above exception was the direct cause of the following exception:

Traceback (most recent call last): File "/home/family/AI-powered-files-system-and-the-searching-framwork/umap_Visualization_labled_all.py", line 66, in results = Parallel(n_jobs=num_cores)( ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 2007, in call return output if self.return_generator else list(output) ^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1650, in _get_outputs yield from self._retrieve() File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1754, in _retrieve self._raise_error_fast() File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1789, in _raise_error_fast error_job.get_result(self.timeout) File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 745, in get_result return self._return_or_raise() ^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 763, in _return_or_raise raise self._result _pickle.UnpicklingError: unpickling stack underflow

ZephyrFBD commented 4 months ago
import os
import pandas as pd
from gensim.models import Doc2Vec
import umap
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
import logging

# Configure logging
logging.basicConfig(filename='./log/log.log', format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
logging.info('UMAP Visualization of Doc2Vec Models')

# Load the CSV file
csv_path = './your_csv_file.csv'
df = pd.read_csv(csv_path)

# Folder that holds the saved models
models_folder = './models/'

# Collect the model files.
# When gensim saves a large model it stores the big arrays in companion
# ``<model file>.*.npy`` files next to the main pickle. Those share the
# 'doc2vec_model' prefix but are not loadable with Doc2Vec.load (trying to
# unpickle them fails), so they are excluded here.
model_files = [
    file for file in os.listdir(models_folder)
    if file.startswith('doc2vec_model') and not file.endswith('.npy')
]

# UMAP settings (random_state pins the seed so the layout is reproducible)
umap_model = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Process every model in turn. A failure on one model is printed and logged,
# then the loop moves on to the next model.
for model_file in model_files:
    model_path = os.path.join(models_folder, model_file)
    model_name = os.path.splitext(model_file)[0]

    try:
        # Load the Doc2Vec model
        model = Doc2Vec.load(model_path)

        # Gather every document vector into one NumPy array
        doc_vectors = np.array([model.dv[i] for i in range(len(model.dv))])

        # Reduce dimensionality with UMAP
        umap_vectors = umap_model.fit_transform(doc_vectors)

        # Labels are the values of the first CSV column (the file paths)
        # NOTE(review): assumes one CSV row per document vector, in the same
        # order -- confirm against the script that trained the models.
        labels = df.iloc[:, 0].tolist()

        # Give every distinct label its own colour
        unique_labels = list(set(labels))
        color_palette = sns.color_palette("hsv", len(unique_labels))
        label_to_color = {label: color_palette[i] for i, label in enumerate(unique_labels)}

        # Per-point colours
        colors = [label_to_color[label] for label in labels]

        # Draw the colour-coded UMAP scatter plot
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.scatter(umap_vectors[:, 0], umap_vectors[:, 1], c=colors, marker='.')
            plt.title(f'UMAP Visualization of Document Vectors for {model_name}')
            plt.xlabel('UMAP Component 1')
            plt.ylabel('UMAP Component 2')
            plt.savefig(f'./plots/{model_name}_umap.png')  # save the image
            plt.show()
        finally:
            # pyplot keeps every figure alive until it is closed; without this
            # one figure per model would accumulate, also when plotting fails.
            plt.close(fig)

        logging.info(f'UMAP Visualization generated for model: {model_name}')

    except Exception as e:
        error_msg = f'Error processing model {model_name}: {str(e)}'
        print(error_msg)
        logging.error(error_msg)

logging.info('UMAP Visualization of all Doc2Vec models completed')
ZephyrFBD commented 4 months ago
import os
import pandas as pd
from gensim.models import Doc2Vec
import umap
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
import logging
from joblib import Parallel, delayed

# Configure logging
logging.basicConfig(filename='./log/log.log', format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
logging.info('UMAP Visualization of Doc2Vec Models')

# Load the CSV file
csv_path = './your_csv_file.csv'
df = pd.read_csv(csv_path)

# Folder that holds the saved models
models_folder = './models/'

# Collect the model files.
# When gensim saves a large model it stores the big arrays in companion
# ``<model file>.*.npy`` files next to the main pickle. Those share the
# 'doc2vec_model' prefix but are not loadable with Doc2Vec.load (trying to
# unpickle them fails), so they are excluded here.
model_files = [
    file for file in os.listdir(models_folder)
    if file.startswith('doc2vec_model') and not file.endswith('.npy')
]

# UMAP settings (random_state pins the seed so the layout is reproducible)
umap_model = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Worker routine that handles a single model.
# It must be defined before the Parallel call further down: module-level code
# runs top to bottom, so referencing process_model before its ``def`` has
# executed raises NameError.
def process_model(model_file, models_folder, umap_model, df):
    """Reduce one Doc2Vec model's document vectors to 2-D with UMAP and save a scatter plot.

    Any exception raised while processing is caught, printed and logged, so a
    single unreadable model does not abort the whole batch.

    Args:
        model_file: File name of the saved model inside ``models_folder``.
        models_folder: Directory that contains the saved models.
        umap_model: Reducer exposing ``fit_transform``; it is fitted on this
            model's document vectors.
        df: DataFrame whose first column supplies the labels (assumed to hold
            one row per document vector, in the same order -- TODO confirm).

    Returns:
        None. The plot is written to ``./plots/<model name>_umap.png``.
    """
    model_path = os.path.join(models_folder, model_file)
    model_name = os.path.splitext(model_file)[0]

    try:
        # Load the Doc2Vec model
        model = Doc2Vec.load(model_path)

        # Gather every document vector into one NumPy array
        doc_vectors = np.array([model.dv[i] for i in range(len(model.dv))])

        # Reduce dimensionality with UMAP
        umap_vectors = umap_model.fit_transform(doc_vectors)

        # Labels are the values of the first CSV column (the file paths)
        labels = df.iloc[:, 0].tolist()

        # Give every distinct label its own colour
        unique_labels = list(set(labels))
        color_palette = sns.color_palette("hsv", len(unique_labels))
        label_to_color = {label: color_palette[i] for i, label in enumerate(unique_labels)}

        # Per-point colours
        colors = [label_to_color[label] for label in labels]

        # Draw the colour-coded UMAP scatter plot
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.scatter(umap_vectors[:, 0], umap_vectors[:, 1], c=colors, marker='.')
            plt.title(f'UMAP Visualization of Document Vectors for {model_name}')
            plt.xlabel('UMAP Component 1')
            plt.ylabel('UMAP Component 2')
            plt.savefig(f'./plots/{model_name}_umap.png')  # save the image
        finally:
            # No plt.show() here: this code runs inside joblib worker
            # processes, where an interactive window would stall the worker.
            # Closing the figure releases it even when plotting fails.
            plt.close(fig)

        logging.info(f'UMAP Visualization generated for model: {model_name}')

    except Exception as e:
        error_msg = f'Error processing model {model_name}: {str(e)}'
        print(error_msg)
        logging.error(error_msg)


# Process the models in parallel
num_cores = 20  # adjust to match the cores available on your machine
results = Parallel(n_jobs=num_cores)(
    delayed(process_model)(model_file, models_folder, umap_model, df) for model_file in model_files
)

# Record completion in the log
logging.info('UMAP Visualization of all Doc2Vec models completed')
ZephyrFBD commented 4 months ago
import os
import pandas as pd
from gensim.models import Doc2Vec
import umap
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
import logging
from joblib import Parallel, delayed

# Configure logging
logging.basicConfig(filename='./log/log.log', format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
logging.info('UMAP Visualization of Doc2Vec Models')

# Load the CSV file
csv_path = './your_csv_file.csv'
df = pd.read_csv(csv_path)

# Folder that holds the saved models
models_folder = './models/'

# Collect the model files.
# When gensim saves a large model it stores the big arrays in companion
# ``<model file>.*.npy`` files next to the main pickle. Those share the
# 'doc2vec_model' prefix but are not loadable with Doc2Vec.load (trying to
# unpickle them fails), so they are excluded here.
model_files = [
    file for file in os.listdir(models_folder)
    if file.startswith('doc2vec_model') and not file.endswith('.npy')
]

# UMAP settings (random_state pins the seed so the layout is reproducible)
umap_model = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Worker routine that handles a single model.
# It must be defined before the Parallel call further down: module-level code
# runs top to bottom, so referencing process_model before its ``def`` has
# executed raises NameError.
def process_model(model_file, models_folder, umap_model, df):
    """Reduce one Doc2Vec model's document vectors to 2-D with UMAP and save a scatter plot.

    Any exception raised while processing is caught, printed and logged, so a
    single unreadable model does not abort the whole batch.

    Args:
        model_file: File name of the saved model inside ``models_folder``.
        models_folder: Directory that contains the saved models.
        umap_model: Reducer exposing ``fit_transform``; it is fitted on this
            model's document vectors.
        df: DataFrame whose first column supplies the labels (assumed to hold
            one row per document vector, in the same order -- TODO confirm).

    Returns:
        None. The plot is written to ``./out/<model name>_umap.png``.
    """
    model_path = os.path.join(models_folder, model_file)
    model_name = os.path.splitext(model_file)[0]

    try:
        # Load the Doc2Vec model
        model = Doc2Vec.load(model_path)

        # Gather every document vector into one NumPy array
        doc_vectors = np.array([model.dv[i] for i in range(len(model.dv))])

        # Reduce dimensionality with UMAP
        umap_vectors = umap_model.fit_transform(doc_vectors)

        # Labels are the values of the first CSV column (the file paths)
        labels = df.iloc[:, 0].tolist()

        # Give every distinct label its own colour
        unique_labels = list(set(labels))
        color_palette = sns.color_palette("hsv", len(unique_labels))
        label_to_color = {label: color_palette[i] for i, label in enumerate(unique_labels)}

        # Per-point colours
        colors = [label_to_color[label] for label in labels]

        # Draw the colour-coded UMAP scatter plot
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.scatter(umap_vectors[:, 0], umap_vectors[:, 1], c=colors, marker='.')
            plt.title(f'UMAP Visualization of Document Vectors for {model_name}')
            plt.xlabel('UMAP Component 1')
            plt.ylabel('UMAP Component 2')
            plt.savefig(f'./out/{model_name}_umap.png')  # save the image into the out folder
        finally:
            # Close in ``finally`` so the figure is released even when
            # plotting or saving raises.
            plt.close(fig)

        logging.info(f'UMAP Visualization generated for model: {model_name}')

    except Exception as e:
        error_msg = f'Error processing model {model_name}: {str(e)}'
        print(error_msg)
        logging.error(error_msg)


# Process the models in parallel
num_cores = 20  # adjust to match the cores available on your machine
results = Parallel(n_jobs=num_cores)(
    delayed(process_model)(model_file, models_folder, umap_model, df) for model_file in model_files
)

# Record completion in the log
logging.info('UMAP Visualization of all Doc2Vec models completed')
ZephyrFBD commented 4 months ago

Traceback (most recent call last): File "/home/family/AI-powered-files-system-and-the-searching-framwork/umap_Visualization_labled_all.py", line 31, in results = Parallel(n_jobs=num_cores)( ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 2007, in call return output if self.return_generator else list(output) ^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1650, in _get_outputs yield from self._retrieve() File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1754, in _retrieve self._raise_error_fast() File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1789, in _raise_error_fast error_job.get_result(self.timeout) File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 745, in get_result return self._return_or_raise() ^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 763, in _return_or_raise raise self._result File "/home/family/AI-powered-files-system-and-the-searching-framwork/venv/lib/python3.12/site-packages/joblib/parallel.py", line 1469, in dispatch_one_batch islice = list(itertools.islice(iterator, big_batch_size)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/family/AI-powered-files-system-and-the-searching-framwork/umap_Visualization_labled_all.py", line 32, in delayed(process_model)(model_file, models_folder, umap_model, df) for model_file in model_files ^^^^^^^^^^^^^ NameError: name 'process_model' is not defined

ZephyrFBD commented 4 months ago
import os
import pandas as pd
from gensim.models import Doc2Vec
import umap
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
import logging
from joblib import Parallel, delayed

# Configure logging
logging.basicConfig(filename='./log/log.log', format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
logging.info('UMAP Visualization of Doc2Vec Models')

# Load the CSV file
csv_path = './your_csv_file.csv'
df = pd.read_csv(csv_path)

# Folder that holds the saved models
models_folder = './models/'

# Collect the model files.
# When gensim saves a large model it stores the big arrays in companion
# ``<model file>.*.npy`` files next to the main pickle. Those share the
# 'doc2vec_model' prefix but are not loadable with Doc2Vec.load (trying to
# unpickle them fails), so they are excluded here.
model_files = [
    file for file in os.listdir(models_folder)
    if file.startswith('doc2vec_model') and not file.endswith('.npy')
]

# UMAP settings (random_state pins the seed so the layout is reproducible)
umap_model = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)

# Worker routine that handles a single model
def process_model(model_file, models_folder, umap_model, df):
    """Reduce one Doc2Vec model's document vectors to 2-D with UMAP and save a scatter plot.

    Any exception raised while processing is caught, printed and logged, so a
    single unreadable model does not abort the whole batch.

    Args:
        model_file: File name of the saved model inside ``models_folder``.
        models_folder: Directory that contains the saved models.
        umap_model: Reducer exposing ``fit_transform``; it is fitted on this
            model's document vectors.
        df: DataFrame whose first column holds file paths; the first 16
            characters of each base name become the label (assumed to hold
            one row per document vector, in the same order -- TODO confirm).

    Returns:
        None. The plot is written to ``./out/<model name>_umap.png``.
    """
    model_path = os.path.join(models_folder, model_file)
    model_name = os.path.splitext(model_file)[0]

    try:
        # Load the Doc2Vec model
        model = Doc2Vec.load(model_path)

        # Gather every document vector into one NumPy array
        doc_vectors = np.array([model.dv[i] for i in range(len(model.dv))])

        # Reduce dimensionality with UMAP
        umap_vectors = umap_model.fit_transform(doc_vectors)

        # Label = first 16 characters of each file's base name
        labels = [os.path.basename(path)[:16] for path in df.iloc[:, 0].tolist()]

        # Give every distinct label its own colour
        unique_labels = list(set(labels))
        color_palette = sns.color_palette("hsv", len(unique_labels))
        label_to_color = {label: color_palette[i] for i, label in enumerate(unique_labels)}

        # Per-point colours
        colors = [label_to_color[label] for label in labels]

        # Draw the colour-coded UMAP scatter plot
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.scatter(umap_vectors[:, 0], umap_vectors[:, 1], c=colors, marker='.')
            plt.title(f'UMAP Visualization of Document Vectors for {model_name}')
            plt.xlabel('UMAP Component 1')
            plt.ylabel('UMAP Component 2')
            plt.savefig(f'./out/{model_name}_umap.png')  # save the image into the out folder
        finally:
            # Close in ``finally`` so the figure is released even when
            # plotting or saving raises.
            plt.close(fig)

        logging.info(f'UMAP Visualization generated for model: {model_name}')

    except Exception as e:
        error_msg = f'Error processing model {model_name}: {str(e)}'
        print(error_msg)
        logging.error(error_msg)

# Fan the models out over a pool of worker processes
num_cores = 20  # adjust to match the cores available on your machine
jobs = (
    delayed(process_model)(model_file, models_folder, umap_model, df)
    for model_file in model_files
)
results = Parallel(n_jobs=num_cores)(jobs)

# Record completion in the log
logging.info('UMAP Visualization of all Doc2Vec models completed')
ZephyrFBD commented 4 months ago
import os
import pandas as pd
from gensim.models import Doc2Vec
from sklearn.manifold import TSNE  # 导入 t-SNE
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
import logging
from joblib import Parallel, delayed

# Configure logging
logging.basicConfig(filename='./log/log.log', format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
logging.info('t-SNE Visualization of Doc2Vec Models')

# Load the CSV file
csv_path = './your_csv_file.csv'
df = pd.read_csv(csv_path)

# Folder that holds the saved models
models_folder = './models/'

# Collect the model files.
# When gensim saves a large model it stores the big arrays in companion
# ``<model file>.*.npy`` files next to the main pickle. Those share the
# 'doc2vec_model' prefix but are not loadable with Doc2Vec.load (trying to
# unpickle them fails), so they are excluded here.
model_files = [
    file for file in os.listdir(models_folder)
    if file.startswith('doc2vec_model') and not file.endswith('.npy')
]

# t-SNE settings
# NOTE(review): newer scikit-learn releases appear to rename ``n_iter`` to
# ``max_iter`` -- confirm against the installed version.
tsne_model = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=42)

# Worker routine that handles a single model
def process_model(model_file, models_folder, tsne_model, df):
    """Reduce one Doc2Vec model's document vectors to 2-D with t-SNE and save a scatter plot.

    Any exception raised while processing is caught, printed and logged, so a
    single unreadable model does not abort the whole batch.

    Args:
        model_file: File name of the saved model inside ``models_folder``.
        models_folder: Directory that contains the saved models.
        tsne_model: Reducer exposing ``fit_transform``; it is fitted on this
            model's document vectors.
        df: DataFrame whose first column holds file paths; the first 16
            characters of each base name become the label (assumed to hold
            one row per document vector, in the same order -- TODO confirm).

    Returns:
        None. The plot is written to ``./out/<model name>_tsne.png``.
    """
    model_path = os.path.join(models_folder, model_file)
    model_name = os.path.splitext(model_file)[0]

    try:
        # Load the Doc2Vec model
        model = Doc2Vec.load(model_path)

        # Gather every document vector into one NumPy array
        doc_vectors = np.array([model.dv[i] for i in range(len(model.dv))])

        # Reduce dimensionality with t-SNE
        tsne_vectors = tsne_model.fit_transform(doc_vectors)

        # Label = first 16 characters of each file's base name
        labels = [os.path.basename(path)[:16] for path in df.iloc[:, 0].tolist()]

        # Give every distinct label its own colour
        unique_labels = list(set(labels))
        color_palette = sns.color_palette("hsv", len(unique_labels))
        label_to_color = {label: color_palette[i] for i, label in enumerate(unique_labels)}

        # Per-point colours
        colors = [label_to_color[label] for label in labels]

        # Draw the colour-coded t-SNE scatter plot
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.scatter(tsne_vectors[:, 0], tsne_vectors[:, 1], c=colors, marker='.')
            plt.title(f't-SNE Visualization of Document Vectors for {model_name}')
            plt.xlabel('t-SNE Component 1')
            plt.ylabel('t-SNE Component 2')
            plt.savefig(f'./out/{model_name}_tsne.png')  # save the image into the out folder
        finally:
            # Close in ``finally`` so the figure is released even when
            # plotting or saving raises.
            plt.close(fig)

        logging.info(f't-SNE Visualization generated for model: {model_name}')

    except Exception as e:
        error_msg = f'Error processing model {model_name}: {str(e)}'
        print(error_msg)
        logging.error(error_msg)

# Fan the models out over a pool of worker processes
num_cores = 20  # adjust to match the cores available on your machine
jobs = (
    delayed(process_model)(model_file, models_folder, tsne_model, df)
    for model_file in model_files
)
results = Parallel(n_jobs=num_cores)(jobs)

# Record completion in the log
logging.info('t-SNE Visualization of all Doc2Vec models completed')