#!/usr/bin/env python
#auth:Aaron Elmore
#Read in csv's for a given directory and output the headers to a headers.csv

import os

def trim(word):
    """Normalise one raw header cell so it can be compared across files.

    Leading/trailing whitespace is stripped first, then every newline and
    double-quote character is removed, and the result is lower-cased.
    Whitespace that was inside the quotes is left in place.
    """
    cleaned = word.strip()
    for unwanted in ("\n", "\""):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.lower()

#get relative directory and give a slight warning
print ""
print "" 
print "*"*100
print "*********** Warning will overwrite a headers.csv in the directory entered  *************************"
print "*********** Headers will be case folded. Outside whitespace and \" chars will be trimmed ************"
print "*"*100

dr= raw_input("Directory with CSVs relative to here (no leading slash), enter for this directory, \n or absolute (leading slash ie /home/user/test):")
if dr.find("/")==0:
    nd = dr
else:
    cd = os.getcwd()
    nd=cd+"/"+dr
dirList=os.listdir(nd)

dr = raw_input("Enter the single delimiter (default is ,). Type 'tab' for \\t :")
if dr:
    if dr =="tab" or dr =="'tab'":
        delim="\t"
    else:
        delim = str(dr)
else:
    delim = ","
    
#read in the headers
headers = dict()     #file name -> sorted list of trimmed header names
colSize = list()     #number of header columns per file, same order as fileNames
fileNames = list()   #the CSV files that were read, in directory-listing order

for fname in dirList:
    dfile = nd + "/" + fname
    #skip anything that is not a .csv, and anything whose name ends in
    #headers.csv (this script writes its own output to headers.csv in nd)
    #NOTE(review): the suffix test also skips names such as "my_headers.csv"
    if not dfile.endswith(".csv") or dfile.endswith("headers.csv"):
        continue
    #only the first line is needed; the with-block makes sure the handle is
    #really closed once it has been read
    with open(dfile, "r") as f:
        headerline = f.readline()
    cols = sorted(trim(col) for col in headerline.split(delim))
    colSize.append(len(cols))
    headers[fname] = cols
    fileNames.append(fname)
        

if len(fileNames)==0:
    print "No CSVs found in:",nd
    exit()
else:
    print len(fileNames), " files checked in ", nd," using the delimiter: ",delim
    print "Files: ",
    for file in fileNames:
        print file,",",
    print ""
    print "*"*100


#what's the largest list of headers
#NOTE: maxCol and numFiles are not referenced anywhere else in the visible script
maxCol = max(colSize)
numFiles = len(headers)

#count the number of files each header term occurs in
wordCount = dict()
for cols in headers.values():
    #set(): a column name repeated inside one file must still count as a
    #single file, otherwise the "appears in N files" ordering is inflated
    for wrd in set(cols):
        wordCount[wrd] = wordCount.get(wrd, 0) + 1

#quick, should check for file first
outfile = nd+"/headers.csv"
fo = open(outfile,"w")


#build lists based on # of files the word appears in
counts = set(wordCount.values())

wordCountList=dict()
for x in counts:
    wordCountList[x] = list()

for wrds in wordCount.iterkeys():
    wordCountList[wordCount[wrds]].append(wrds)
        
#sort in reverse order of counts
counts = list(counts)   
counts.reverse()


#write headers for files
for file in fileNames:
    fo.write(file+",")
    print file,",",
    
fo.write("\n")
print ""


#words in order of appearance then by alpha
for x in (counts):
    wordCountList[x].sort()
    for word in wordCountList[x]:
        for file in fileNames:
            if word in headers[file]:
                print word,",",
                fo.write(word+",")
            else:
                print ",",
                fo.write(",")
        print ""
        fo.write("\n")


fo.close()