gbinal / playground

bits and pieces
http://gbinal.github.io/playground
0 stars 1 forks source link

test #10

Open gbinal opened 8 years ago

gbinal commented 8 years ago
import collections
import csv
import json
import re
​
def readData(inputFile):
    """Read a CSV file and return its data rows, skipping the header row.

    Args:
        inputFile: Path of the CSV file to read.

    Returns:
        A list of rows (each a list of strings) for every line after the
        first. An empty or header-only file yields an empty list.
    """
    # 'rU' was removed in Python 3.11; newline='' lets the csv module handle
    # line endings (including newlines embedded in quoted fields) itself.
    with open(inputFile, 'r', newline='') as infile:
        reader = csv.reader(infile)
        # Discard the header row; the default keeps an empty file from raising.
        next(reader, None)
        return list(reader)
​
def writeJson(inputData, fileName):
    """Serialize inputData as JSON (4-space indent) into the file fileName.

    The file is created if missing and truncated if it already exists.
    """
    with open(fileName, 'w+') as handle:
        json.dump(inputData, handle, indent=4)
​
def makeAgencyOutput(inputList, errorDict, errorTypeDict):
    """Build one ordered record per agency for the agencies JSON output.

    Args:
        inputList: Sequence of [agency, counts] pairs, where counts is indexed
            by error-category name.
        errorDict: Mapping of agency -> total error count.
        errorTypeDict: Mapping of output label -> error-category name.

    Returns:
        A list of OrderedDicts with 'Agency', 'Errors', and one entry per
        label in errorTypeDict. A label whose category cannot be looked up in
        the agency's counts (KeyError or TypeError) is reported as 0.
    """
    results = []
    for entry in inputList:
        counts = entry[1]
        agencyName = entry[0]
        record = collections.OrderedDict()
        record['Agency'] = agencyName
        record['Errors'] = errorDict[agencyName]
        for label, category in errorTypeDict.items():
            try:
                record[label] = counts[category]
            except (KeyError, TypeError):
                record[label] = 0
        results.append(record)
    return results
​
def getKey(item):
    """Sort key: return the first element of item."""
    first = item[0]
    return first
​
def trimErrorField(errorField):
    """Return the text between the last 'Guideline' marker and the next '.'.

    If errorField contains no 'Guideline' marker, the text before its first
    '.' (or the whole string when there is none) is returned instead.
    """
    # The greedy '.*' makes the split land on the last 'Guideline' on a line;
    # the final piece is whatever follows it (or the untouched input).
    afterMarker = re.split('.*(Guideline)', errorField)[-1]
    head, _, _ = afterMarker.partition('.')
    return head
​
def categorize(dataset, referenceDict, colNum, altName):
    """Append a category to every row of dataset, in place.

    The value in column colNum is looked up in referenceDict; rows whose
    value is not a key there receive altName instead. The same (mutated)
    dataset object is returned.
    """
    for record in dataset:
        label = referenceDict.get(record[colNum], altName)
        record.append(label)
    return dataset
​
def countDict(dataset, colIndex):
    """Count how many rows of dataset carry each value in column colIndex.

    Returns a plain dict mapping value -> number of occurrences, with keys in
    first-seen order.
    """
    tally = {}
    for record in dataset:
        value = record[colIndex]
        tally[value] = tally.get(value, 0) + 1
    return tally
​
# Read in a11y.csv for errors and domains.csv for agencies.
ally1 = readData('a11y.csv')
domains = readData('domains.csv')

# ussm.gov, whistleblower.gov, and safeocs.gov are removed from the a11y data
# due to discrepancies between the two datasets. TODO: resolve at the source.
excludedDomains = {'safeocs.gov', 'whistleblower.gov', 'ussm.gov'}
ally = [row for row in ally1 if row[0] not in excludedDomains]

# Truncate the a11y rows so they are more manageable: keep only the domain
# name [0] and the trimmed principle/guideline code taken from column [4].
main = [[row[0], trimErrorField(row[4])] for row in ally]

# Index domains.csv rows by lower-cased domain name so that the join below is
# a dictionary lookup rather than a scan of every domain for every error.
domainLookup = {}
for domain in domains:
    domainLookup.setdefault(domain[0].lower(), domain)

# Add domains.csv columns [1] and [2] to each error row; the code below reads
# the latter (row index 3) as the agency. Error rows whose domain is not in
# domains.csv are dropped: left in place they would be too short, and their
# error category would later be counted as if it were an agency name.
matchedErrors = []
for error in main:
    domain = domainLookup.get(error[0])
    if domain is None:
        continue
    error.append(domain[1])
    error.append(domain[2])
    matchedErrors.append(error)
main = matchedErrors

# Dictionaries; branches = branch lookup, errorCats = error category lookup.
branches = {"Library of Congress":"Legislative","The Legislative Branch (Congress)":"Legislative",
"Government Printing Office":"Legislative","Congressional Office of Compliance":"Legislative",
"The Judicial Branch (Courts)":"Judicial"}
errorCats = {'1_4':'Color Contrast Error', '1_1':'Alt Tag Error', '4_1':'HTML/Attribute Error', '1_3':'Form Error'}

# Define branches for the 'main' and 'domains' sets, then error categories for
# 'main'. Each call appends one column, so in 'main' the branch lands at
# index 4 and the error category becomes the last column.
main = categorize(main, branches, -1, 'Executive')
domains = categorize(domains, branches, 2, 'Executive')
main = categorize(main, errorCats, 1, 'Other Error')

totalErrorsByDomain = countDict(main, 0)
totalErrorsByAgency = countDict(main, 3)

# Create a dict of base vs. canonical domains (first occurrence wins).
canonicals = {}
for row in ally:
    if row[0] not in canonicals:
        canonicals[row[0]] = row[1]

# Group the error rows by domain once, so that the per-domain category counts
# below do not rescan all of 'main' for every domain.
rowsByDomain = {}
for line in main:
    rowsByDomain.setdefault(line[0], []).append(line)

# Split the domains into those with and without recorded errors.
noErrors = []
errors = []
for domain in domains:
    if domain[0].lower() in totalErrorsByDomain:
        errors.append(domain)
    else:
        noErrors.append(domain)

# Every domain row gains three trailing fields: error count, per-category
# error counts, and canonical URL. The canonical lookup uses the lower-cased
# name for both the membership test and the fetch, falling back to the
# lower-cased domain name itself.
for row in noErrors:
    name = row[0].lower()
    row.append(0)
    row.append({})
    row.append('http://' + canonicals.get(name, name))

for row in errors:
    name = row[0].lower()
    row.append(totalErrorsByDomain[name])
    row.append(countDict(rowsByDomain[name], -1))
    row.append('http://' + canonicals.get(name, name))

domains = sorted(errors + noErrors, key=getKey)

dictList = []
for row in domains:
    # The last four fields are the ones appended above (branch, error count,
    # category counts, canonical URL), so they are read relative to the end of
    # the row rather than by absolute position.
    branch, errorCount, errorList, canonical = row[-4:]
    subDict = collections.OrderedDict()
    subDict['agency'] = row[2]
    subDict['branch'] = branch
    subDict['canonical'] = canonical
    subDict['domain'] = row[0].lower()
    subDict['errors'] = errorCount
    subDict['errorlist'] = errorList
    dictList.append(subDict)

finalDict = {}
finalDict['data'] = dictList

writeJson(finalDict, 'domains.json')

# Count errors per category for each agency, keeping agencies in first-seen
# order.
agencyCategoryCounts = collections.OrderedDict()
for row in main:
    sub = agencyCategoryCounts.setdefault(row[3], {})
    sub[row[-1]] = sub.get(row[-1], 0) + 1

agencyList = list(agencyCategoryCounts)
agencyErrorSets = [[agency, sub] for agency, sub in agencyCategoryCounts.items()]

# Output label -> error category name. The values must match the category
# names in errorCats (plus 'Other Error') exactly, otherwise the count is 0.
errorTypes = {'Color Contrast Errors':'Color Contrast Error', 'HTML/Attribute Errors':'HTML/Attribute Error',
'Form Errors':'Form Error', 'Alt Tag Errors':'Alt Tag Error', 'Other Errors':'Other Error'}

output = makeAgencyOutput(agencyErrorSets, totalErrorsByAgency, errorTypes)
finalOutput = {}
finalOutput['data'] = output

writeJson(finalOutput, 'agencies.json')