esturdivant-usgs / science-base-automation

Automating large USGS ScienceBase data releases
4 stars 2 forks source link

All folders are converted to SB pages #29

Closed esturdivant-usgs closed 6 years ago

esturdivant-usgs commented 6 years ago

Either add a flag to indicate folders that should not become pages, or only create a page if the folder has an XML file somewhere below it.

esturdivant-usgs commented 6 years ago
def setup_subparents(sb, parentdir, landing_id, xmllist, imagefile, verbose=True):
    """Find or create a ScienceBase child page for each folder leading to an XML file.

    Only directories that lie on the path between ``parentdir`` and one of the
    files in ``xmllist`` get a page; folders with no XML below them are never
    visited.

    Args:
        sb: Session object providing ``get_item`` and ``upload_file_to_item``.
        parentdir: Local top-level directory that corresponds to the landing page.
        landing_id: ID of the landing-page item.
        xmllist: Paths of XML files; expected to be located under ``parentdir``.
        imagefile: Path of a file to upload to every sub-page. Any falsy value
            (``False``, ``None``, ``''``) skips the upload.
        verbose: Passed through to ``find_or_create_child``.

    Returns:
        Tuple ``(dict_DIRtoID, dict_IDtoJSON, dict_PARtoCHILDS)``:
        directory name -> item ID, item ID -> item JSON, and
        parent ID -> set of child IDs.

    Side effects:
        Writes ``dir_to_id.json``, ``id_to_json.json`` and the pickled
        ``parentID_to_childrenIDs.txt`` into ``parentdir``, replacing any
        previous contents.

    NOTE(review): directories are keyed by their bare name, and visited pairs
    by ``<parent name>/<dir name>`` only, so identically named folders in
    different branches of the tree would share an entry - confirm that the
    data-release layout rules this out.
    """
    landing_item = sb.get_item(landing_id)

    # Seed the lookups with the landing page itself.
    dict_DIRtoID = {os.path.basename(parentdir): landing_id}
    dict_IDtoJSON = {landing_id: landing_item}
    dict_PARtoCHILDS = {}

    # (parent name, dir name) pairs that already have a page; a set keeps the
    # membership test constant-time however many XML files are processed.
    seen_pairs = set()

    # Paths are taken relative to the directory *containing* parentdir so that
    # the first element of every chain is parentdir's own name.
    base_dir = os.path.split(parentdir)[0]

    for xml_file in xmllist:
        dirpath = os.path.relpath(os.path.split(xml_file)[0], base_dir)
        dirchain = splitall(dirpath)
        # Walk consecutive (parent, child) directory names down the chain.
        for root, dirname in zip(dirchain, dirchain[1:]):
            pair = os.path.join(root, dirname)
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)

            parent_id = dict_DIRtoID[root]
            subpage = find_or_create_child(sb, parent_id, dirname, verbose=verbose)
            if imagefile:
                subpage = sb.upload_file_to_item(subpage, imagefile)

            dict_DIRtoID[dirname] = subpage['id']
            dict_IDtoJSON[subpage['id']] = subpage
            dict_PARtoCHILDS.setdefault(parent_id, set()).add(subpage['id'])

    # Save dictionaries
    with open(os.path.join(parentdir, 'dir_to_id.json'), 'w') as f:
        json.dump(dict_DIRtoID, f)
    with open(os.path.join(parentdir, 'id_to_json.json'), 'w') as f:
        json.dump(dict_IDtoJSON, f)
    # Pickled because the values are sets, which JSON cannot represent.
    # Opened with 'wb' (not append) so the file holds exactly one pickle: a
    # single pickle.load() reads only the first object in the file, which
    # would be a stale mapping if earlier runs had been appended to.
    with open(os.path.join(parentdir, 'parentID_to_childrenIDs.txt'), 'wb') as f:
        pickle.dump(dict_PARtoCHILDS, f)

    return dict_DIRtoID, dict_IDtoJSON, dict_PARtoCHILDS