polyanskiy / refractiveindex.info-database

Database of optical constants
Creative Commons Zero v1.0 Universal
162 stars 57 forks source link

Some files in the database are not in the index #4

Closed tnorth closed 9 years ago

tnorth commented 9 years ago

These files are on disk, but not in library.yml:

Checks were performed with the Python script below.

Typical output:

File number mismatch: 1508 files in the index, 1441 files on disk
1436 unique files in the index, 1441 unique files on disk

database/glass/schott/N-BK7.yml appears 3 times in the index
[...]

Files in one set but not the other: 
database/main/Ti/Rakic.yml
database/glass/schott/P-SF69.yml
database/organic/C2H6O2 - ethylene glycol/Sani2.yml
database/organic/C2H6O2 - ethylene glycol/Sani1.yml
database/main/Ag/Sthrenberg.yml

All files of the index have their counterpart on disk.
Some files on disk are not in the index :
database/main/Ag/Sthrenberg.yml
database/glass/schott/P-SF69.yml
database/organic/C2H6O2 - ethylene glycol/Sani2.yml
database/main/Ti/Rakic.yml
database/organic/C2H6O2 - ethylene glycol/Sani1.yml
import yaml

import os
import fnmatch
from collections import Counter

def list_yaml_files(root, index_name="library.yml"):
    """Return every ``*.yml`` file found recursively under *root*.

    The index file itself (``root/index_name``) is left out of the result,
    since it describes the database rather than being part of it.  Paths are
    normalised so they compare equal to the ones built from the index.
    """
    index_path = os.path.normpath(os.path.join(root, index_name))
    found = [
        os.path.normpath(os.path.join(dirpath, fname))
        for dirpath, _dirnames, files in os.walk(root)
        for fname in fnmatch.filter(files, "*.yml")
    ]
    # Filtering (rather than list.remove) avoids a ValueError when the index
    # file is missing from the walk.
    return [f for f in found if f != index_path]


def collect_indexed_paths(index, root):
    """Return the path of every page referenced by the parsed index.

    *index* is the parsed content of the library file: a list of shelves
    (main, organic, glasses, other, 3D), each with a ``content`` list of
    books and dividers; each book has its own ``content`` list of pages and
    dividers.  Dividers carry no file and are skipped.  Duplicated references
    are kept, in index order, so that callers can count them.

    Raises ``KeyError`` if a shelf or book lacks ``content`` or a page lacks
    ``path``.
    """
    paths = []
    for shelf in index or []:
        for book in shelf["content"]:
            # Only entries flagged as BOOK (and not as DIVIDER) hold pages.
            if "DIVIDER" in book or "BOOK" not in book:
                continue
            for page in book["content"]:
                if "DIVIDER" in page:
                    continue
                paths.append(os.path.normpath(os.path.join(root, page["path"])))
    return paths


def duplicated_entries(paths):
    """Return ``(path, count)`` for each path listed more than once.

    Entries are ordered from most to least referenced.  Every duplicate is
    reported, regardless of how the index and disk file counts compare.
    """
    return [(name, n) for name, n in Counter(paths).most_common() if n > 1]


def build_report(disk_files, indexed_files):
    """Compare files on disk with files in the index; return report lines.

    *disk_files* and *indexed_files* are lists of paths; *indexed_files* may
    contain duplicates.  File lists are sorted so the report is reproducible.
    """
    lines = []
    if len(indexed_files) != len(disk_files):
        lines.append(
            "File number mismatch: {} files in the index, {} files on disk".format(
                len(indexed_files), len(disk_files)))

    on_disk = set(disk_files)
    in_index = set(indexed_files)
    lines.append(
        "{} unique files in the index, {} unique files on disk".format(
            len(in_index), len(on_disk)))
    lines.append("")

    for name, count in duplicated_entries(indexed_files):
        lines.append("{} appears {} times in the index".format(name, count))
    lines.append("")

    lines.append("Files in one set but not the other: ")
    lines.extend(sorted(on_disk.symmetric_difference(in_index)))
    lines.append("")

    missing_on_disk = in_index - on_disk
    if not missing_on_disk:
        # The indexed files are a subset of the files on disk.
        lines.append("All files of the index have their counterpart on disk.")
        not_indexed = on_disk - in_index
        if not_indexed:
            lines.append("Some files on disk are not in the index :")
            lines.extend(sorted(not_indexed))
    else:
        lines.append("Some files of the index are not on disk :")
        lines.extend(sorted(missing_on_disk))
    return lines


if __name__ == "__main__":
    path = "database"
    db = "library.yml"

    ## List all YML files to process, recursively (index excluded).
    yaml_files = list_yaml_files(path, db)

    ## Load index
    # safe_load: the index is plain data, and yaml.load() without an explicit
    # Loader is rejected by recent PyYAML.  Binary mode lets PyYAML detect the
    # encoding itself instead of relying on the platform default.
    with open(os.path.join(path, db), "rb") as index_file:
        data = yaml.safe_load(index_file)

    indexed_files = collect_indexed_paths(data, path)

    for line in build_report(yaml_files, indexed_files):
        print(line)
polyanskiy commented 9 years ago

fixed!