Venn Diagram Code - Githubissues

Disclaimer

Although the Venn diagram is drawn programmatically and the plotted values are obtained by calculation, the step from the calculated data to the Venn diagram is still manual, i.e. we eyeball the counts and manually type them into the `subsets` argument of the `venn3` function. This is done because I'm too lazy to create a binary encoding for the categorical variables, hahaha.

Result

Requirements:

Python

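pip install pandas pyarrow matplotlib matplotlib-venn

(`pyarrow` is the parquet engine that `pandas.read_parquet` uses to load the dataset; `fastparquet` also works.)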

Data

The nvdcve dataset, which contains the details of all of NVD's CVE data in Parquet format. Use it responsibly (load only the columns you need and apply filters whenever possible). The data can be downloaded from our drive.

Code

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib_venn import venn3
import os

# Path to the NVD CVE parquet file, relative to the working directory.
NVDCVE_FILEPATH = "../data/nvdcve-1.1-all.parquet"

# Set the default resolution of every figure created below to 200 DPI.
plt.rcParams["figure.dpi"] = 200
# Map the dotted field names found in the parquet file to the short
# snake_case column names used by the rest of this script. The keys also
# serve as the selection of columns to load from the file.
#
# NOTE: this mapping used to be defined twice in a row with identical
# content; the redundant first definition has been removed.
colnames = {
    "id": "cve_id",
    "publishedDate": "published_at",
    "impact.baseMetricV2.cvssV2.version": "cvssv2_version",
    "impact.baseMetricV2.cvssV2.baseScore": "cvssv2_score",
    "impact.baseMetricV2.severity": "cvssv2_severity",
    "impact.baseMetricV3.cvssV3.version": "cvssv3_version",
    "impact.baseMetricV3.cvssV3.baseScore": "cvssv3_score",
    "impact.baseMetricV3.cvssV3.baseSeverity": "cvssv3_severity",
}
# Load only the mapped columns from the parquet file, rename them to the
# short names, then derive two columns:
#   - published_at: the published timestamp parsed with pd.to_datetime
#   - cve_year: the second "-"-separated token of the CVE id (left as a string)
#
# `columns` is passed as a real list: read_parquet documents a list here, and
# a dict_keys view is not accepted by every parquet engine code path.
df_cve = (
    pd.read_parquet(NVDCVE_FILEPATH, columns=list(colnames))
    .rename(columns=colnames)
    .assign(
        published_at=lambda df: pd.to_datetime(df["published_at"]),
        cve_year=lambda df: df["cve_id"].str.split("-").str[1],
    )
)

# Determining value for subset
#
# Print the number of CVE ids for every (CVSS v2 version, CVSS v3 version)
# combination. These counts are read off by hand and typed into the `subsets`
# argument of venn3 further down.

print("Use this data to decide on the value for subsets")

version_columns = ["cvssv2_version", "cvssv3_version"]
# Missing versions are replaced by the literal string "None" so that they form
# their own group (groupby drops missing keys by default).
versions_filled = df_cve[version_columns + ["cve_id"]].fillna("None")
version_counts = versions_filled.groupby(version_columns).count()
print(version_counts)

# Draw Venn Diagram
fig, ax = plt.subplots()


def _format_thousands(x):
    """Render a subset count as whole thousands, e.g. 73535 -> '74k'."""
    return f"{round(x/1000):,}k"


# Dashed grey outer circle standing for the whole CVE population; its caption
# and the hand-typed total sit near the top edge of the circle.
all_cves_circle = plt.Circle(
    (0, 0),
    5,
    linestyle="dashed",
    color="grey",
    alpha=0.2,
)
for caption, caption_y, caption_size in (("All CVEs", 5.2, 10), ("230K", 4.5, 8)):
    ax.annotate(
        caption,
        xy=(0, caption_y),
        fontsize=caption_size,
        ha="center",
        va="center",
    )

# Add the outer circle, then draw the three-set Venn diagram on the same axes.
ax.add_patch(all_cves_circle)

# Hand-typed counts, in venn3's documented order
# (Abc, aBc, ABc, abC, AbC, aBC, ABC) with A=V2.0, B=V3.0, C=V3.1.
subset_sizes = (
    73535,  # V2.0 only
    0,      # V3.0 only
    42070,  # V2.0 and V3.0
    35995,  # V3.1 only
    64480,  # V2.0 and V3.1
    0,      # V3.0 and V3.1
    0,      # all three versions
)
venn = venn3(
    subsets=subset_sizes,
    set_labels=("CVSS V2.0", "CVSS V3.0", "CVSS V3.1"),
    set_colors=("#d0ee11", "#76c68f", "#1984c5"),
    ax=ax,
    normalize_to=40,
    subset_label_formatter=_format_thousands,
)

# Move the V2.0-only label (region id "100", rendered as "74k") for legibility.
v2_only_label = venn.get_label_by_id("100")
v2_only_label.set_y(2)
v2_only_label.set_x(-0.7)

# Styling the overall plot: square aspect and fixed limits so the outer
# circle of radius 5 fits with a margin.
ax.set_aspect("equal")
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_title(
    "Komposisi dari CVE yang Dirilis, \nBerdasarkan Versi CVSS yang Dimiliki",
    fontsize=12,
)

# Display the plot
plt.show()

diardanoraihan / VulprioApp