#!/usr/bin/python
#
# x2x version 2. Support seqcluster and sortx
#         April 20, 2009 CER
#     version 4  Write GROUP and ENV files
#         April 30, 2009 CER
#     version 5  write EXP files instead of ENV files
#
"""Merge sortx (.grp) or seqcluster (.csv) cluster files into annotation tables.

Usage: x2x <output_file_name>

Every ``*.grp`` file in the current directory is read; if there are none, every
``*.csv`` file is read instead.  The cluster type encoded at the end of each
file name (``_M<n>``/``_F<n>``/``_N<n>`` for .grp, ``_MAX*``/``_MIN*``/``_AVG*``
for .csv) becomes a column called ``clus_<type>``.  For each input line, the
text before the first space or tab is the row key and the text after the last
space or tab is the value stored in that file's column.

Two tab-separated files are written:

* ``<output_file_name>_x2xGROUP.csv`` -- one row per key, one column per
  cluster type.
* ``<output_file_name>_x2xEXP.csv`` -- one row per key with a single fixed
  experiment column (``Second_Column_Env``).

The GROUP header and rows are also echoed to standard output.
"""

import os
import re
import sys

# Value written in the second column of every data row of the EXP file.
Second_Column_Env = 'one_experiment'

# File-name suffixes that carry the cluster type.  The dot before the
# extension is escaped so that only a literal "." is accepted.
_SORTX_NAME = re.compile(r'_(M\d+|F\d+|N\d+)\.grp$')
_SEQCLUSTER_NAME = re.compile(r'_(MAX\w+|MIN\w+|AVG\w+)\.csv$')

# A single space or a single tab separates the fields of an input line.
_FIELD_SEPARATOR = re.compile(r'[ \t]')


def _select_input_files(filenames):
    """Return ``(files, sortxrun)`` for the given directory listing.

    ``files`` holds the ``.grp`` names when any exist (``sortxrun`` is 1),
    otherwise the ``.csv`` names (``sortxrun`` is 0).  The result is sorted so
    that processing order does not depend on the order of the listing.
    """
    grp_files = sorted(name for name in filenames if name.endswith('.grp'))
    if grp_files:
        return grp_files, 1
    csv_files = sorted(name for name in filenames if name.endswith('.csv'))
    return csv_files, 0


def _parse_line(line):
    """Return ``(key, value)`` for one input line, or ``None`` if it is blank.

    The key is everything before the first space or tab; the value is
    everything after the last space or tab, with carriage returns removed.
    """
    line = line.rstrip('\r\n')
    if not line.strip():
        # A blank line would otherwise turn into a bogus row in the output.
        return None
    tokens = _FIELD_SEPARATOR.split(line)
    return tokens[0], tokens[-1].replace('\r', '')


def _load_table(paths, name_pattern):
    """Read every file in ``paths`` and return ``(table, attrs)``.

    ``table`` maps key -> {column name -> value}; ``attrs`` lists the column
    names in the order the files were read.  Files whose name does not match
    ``name_pattern`` are skipped with a warning instead of aborting the run;
    this also covers x2x output files left behind by an earlier run.
    """
    table = {}
    attrs = []
    for path in paths:
        match = name_pattern.search(path)
        if match is None:
            print("x2x warning: skipping " + path
                  + " (no cluster type in the file name)")
            continue
        column = 'clus_' + match.group(1)   # Pre-pend clus_ to cluster type
        attrs.append(column)
        with open(path, 'r') as handle:
            # Reading starts at the first line; no header line is skipped.
            for line in handle:
                parsed = _parse_line(line)
                if parsed is None:
                    continue
                key, value = parsed
                # setdefault lets a key first seen in a later file get a row.
                table.setdefault(key, {})[column] = value
    return table, attrs


def _write_outputs(output_file_name, table, attrs):
    """Write the GROUP and EXP files and echo the GROUP lines to stdout.

    A key that has no value for some column gets an empty field there.
    """
    group_file_name = output_file_name + '_x2xGROUP.csv'
    exp_file_name = output_file_name + '_x2xEXP.csv'
    columns = sorted(attrs)

    print("-------------------------------------")
    print(" ")
    print(" ")
    print(" ")

    with open(group_file_name, 'w') as group_out:
        with open(exp_file_name, 'w') as exp_out:
            # Column headers: the EXP file only gets the key and experiment.
            exp_out.write('Key' + '\t' + 'Experiment0' + '\t\n')
            header = '\t'.join(['Key'] + columns)
            print(header)
            group_out.write(header + '\n')

            for key in sorted(table):
                exp_out.write(key + '\t' + Second_Column_Env + '\t\n')
                values = [table[key].get(column, '') for column in columns]
                row = '\t'.join([key] + values)
                print(row)
                group_out.write(row + '\n')


def main(argv=None):
    """Run x2x.

    ``argv`` is the full argument vector (program name first) and defaults to
    ``sys.argv``.  Exits with status 1 when the arguments are wrong or no
    usable input file is found in the current directory.
    """
    params = sys.argv if argv is None else argv
    if len(params) != 2:
        print("x2x error: Usage: x2x <output_file_name>")
        sys.exit(1)
    output_file_name = params[1]

    files_grp, sortxrun = _select_input_files(os.listdir(os.curdir))
    if not files_grp:
        # Neither .grp nor .csv files were found.
        print("")
        print("x2x error: Did not find .grp (sortx) or .csv (SeqclusterX) "
              "files in the local directory.")
        print("")
        sys.exit(1)

    name_pattern = _SORTX_NAME if sortxrun else _SEQCLUSTER_NAME
    table, attrs = _load_table(files_grp, name_pattern)
    if not attrs:
        print("x2x error: None of the input files has a cluster type "
              "in its name.")
        sys.exit(1)

    _write_outputs(output_file_name, table, attrs)


if __name__ == "__main__":
    main()