#!/usr/bin/python
"""biodivprep: build the GROUP and EXPeriment input files for biodiv.

April 25, 2009 CER

Uses the output of x2x (possibly edited by the user in a spreadsheet) to
build the input files for biodiv.

Inputs (both must be in the current directory):
    *_x2xGROUP.csv   tab-separated; header row, then one row per sequence
    *_x2xEXP.csv     tab-separated; header row, then one row per sequence

In both files the first cell of every data row is the sequence name and the
first cell of the header row (the ID label) is discarded.

Command line (up to four parameters plus -h):
    biodivprep -g <file containing _x2xGROUP.csv>
               -e <file containing _x2xEXP.csv>
               [-gc <GROUP column name>] [-ec <EXPeriment column name>]
               [-h]

Outputs (written to the current directory):
    <name>_bdp_info.txt                      copy of the progress report
    <gc or "AllGROUPS">_<name>_bdpGROUP.csv  the selected GROUP column(s)
    <ec or "ALLEXPeriments">_<name>_bdpEXP.csv
                                             the selected EXPeriment column(s)

The code only uses constructs that exist in both Python 2.6+ and Python 3.
"""

import os
import re
import sys
import time

biodivprep_version = 0.75

# Separators recognised inside a line.  The capturing group makes re.split
# return the separators too, so they are filtered out again in _split_row.
# NOTE(review): '>' is split out but deliberately NOT discarded, exactly as
# the original script did -- a '>' in the data becomes a token of its own
# (and normally trips the column-count check).  Confirm this is intended.
_SEPARATORS = re.compile(r'(>|\t|\n|\r)')
_DISCARDED_TOKENS = ('', '\t', '\n', '\r')

# Per-option parsing table:
#   flag -> (result key, "value missing" message,
#            "value looks like another flag" message,
#            substring the value must contain, or None)
_OPTION_SPECS = {
    '-g': ('group_file',
           "Error: Missing *_x2xGROUP.csv file name after -g",
           "Error: Missing file name following -g: -g %s",
           '_x2xGROUP.csv'),
    '-e': ('env_file',
           "Error: Missing *_x2xEXP.csv file name after -e",
           "Error: Missing file name following -e: -e %s",
           '_x2xEXP.csv'),
    '-gc': ('group_column',
            "Error: Missing group column name after -gc",
            "Error: Missing column name following -gc: -gc %s",
            None),
    '-ec': ('env_column',
            "Error: Missing EXPeriment column name after -ec",
            "Error: Missing EXPeriment column name following -ec: -ec %s",
            None),
}

# Report texts that differ between the two input files.
_GROUP_TEXT = {
    'empty': "Error: GROUP file %s has no header row",
    'columns': "Number of columns in the group file: %d",
    'unknown_column': "Error: Requested -gc name, %s, not found. Allowed group columns are:",
    'bad_row': "Error while parsing GROUP file at sequence %s; Header column count %d differs from Data column count %d",
    'total': "Total sequences found in GROUP file: %d; Last sequence_name: %s",
}
_EXP_TEXT = {
    'empty': "Error: EXPeriment file %s has no header row",
    'columns': "Number of columns in the EXPeriment file: %d",
    'unknown_column': "Error: Requested -ec name, %s, not found. Allowed EXPerimnet columns are:",
    'bad_row': "Error while parsing EXPeriment file at sequence %s; Header column count %d differs from Data column count %d",
    'total': "Total sequences found in EXPeriment file: %d; Last sequence_name: %s",
}


class _Stop(Exception):
    """Raised to end the run early; ``status`` is the process exit status."""

    def __init__(self, status):
        Exception.__init__(self, status)
        self.status = status


def _out(text=''):
    """Write one line to stdout (works identically on Python 2 and 3)."""
    sys.stdout.write(text + '\n')


def _report(info_output, text=''):
    """Write one line to stdout and the same line to the info file."""
    _out(text)
    info_output.write(text + '\n')


def _print_usage():
    """Print the version banner and the list of accepted arguments."""
    # %6.2F so that version 0.75 is not rounded to 0.8 in the banner.
    _out("biodivprep Version %6.2F" % (biodivprep_version))
    _out()
    _out("biodivprep arguments:")
    _out()
    _out(" biodivprep -g _x2xGROUP.csv -e _x2xEXP.csv")
    _out(" -gc ")
    _out(" -ec ")  # the original text said -ge, an option that does not exist
    _out(" -h ")
    _out()


def _usage_error(message):
    """Print *message* followed by the usage text, then stop with status 1."""
    _out()
    _out(message)
    _out()
    _print_usage()
    raise _Stop(1)


def _parse_args(params):
    """Parse the command-line parameters (program name already removed).

    Returns a dict with the keys 'group_file', 'env_file', 'group_column'
    and 'env_column'; options that were not given map to ''.
    Raises _Stop(0) for -h and _Stop(1) for any malformed parameter list.
    """
    values = {'group_file': '', 'env_file': '',
              'group_column': '', 'env_column': ''}
    pending = list(params)  # work on a copy; never mutate the caller's list
    while pending:
        flag = pending.pop(0)
        if flag == '-h':
            _out()
            _print_usage()
            raise _Stop(0)
        if flag not in _OPTION_SPECS:
            _usage_error("Error: Unrecognized input parameter name: %s" % (flag))
        key, missing_text, dash_text, required = _OPTION_SPECS[flag]
        if values[key]:
            _usage_error("Error: %s parameter found twice in the parameter list" % (flag))
        if not pending:
            _usage_error(missing_text)
        value = pending.pop(0)
        if value[:1] == '-':
            # The "value" is really the next flag: the real value is missing.
            _usage_error(dash_text % (value))
        if required is not None and required not in value:
            _usage_error(missing_text)
        values[key] = value
    return values


def _split_row(line):
    """Split one line into its non-empty cells.

    Tabs, line feeds and carriage returns are separators and are dropped,
    as are empty cells.  Every line is cleaned the same way, so a carriage
    return can neither survive as a bogus column nor be "removed" from a
    line that does not contain one.
    """
    return [token for token in _SEPARATORS.split(line)
            if token not in _DISCARDED_TOKENS]


def _read_table(info_output, file_name, requested_column, text):
    """Read one x2x table and report on it.

    info_output      -- open info file that receives a copy of every message
    file_name        -- name of the table file to read
    requested_column -- column chosen with -gc/-ec, or '' for all columns
    text             -- _GROUP_TEXT or _EXP_TEXT (file-specific messages)

    Returns (column_names, rows, value_counts, row_count) where
      column_names -- header cells without the leading ID cell
      rows         -- {sequence name: {column name: value}}
      value_counts -- {column name: {value: number of rows with that value}}
      row_count    -- number of non-blank data rows read
    Raises _Stop(1) after reporting an empty file, an unknown requested
    column, or a data row whose cell count differs from the header's.
    """
    with open(file_name, 'r') as handle:
        lines = iter(handle)
        header = _split_row(next(lines, ''))
        if not header:
            _report(info_output)
            _report(info_output, text['empty'] % (file_name))
            _report(info_output)
            raise _Stop(1)
        column_names = header[1:]  # drop the ID label
        _report(info_output, text['columns'] % (len(column_names)))

        if requested_column and requested_column not in column_names:
            _report(info_output)
            _report(info_output, text['unknown_column'] % (requested_column))
            _out(str(column_names))
            info_output.write(''.join([name + ' ' for name in column_names]) + '\n')
            _report(info_output)
            raise _Stop(1)

        rows = {}
        value_counts = dict((name, {}) for name in column_names)
        row_count = 0
        sequence_name = ''  # stays '' when the file has no data rows
        for line in lines:
            tokens = _split_row(line)
            if not tokens:
                continue  # blank line (e.g. a trailing newline): nothing to read
            row_count += 1
            sequence_name = tokens[0]
            values = tokens[1:]
            if len(values) != len(column_names):
                _report(info_output)
                _report(info_output, text['bad_row']
                        % (sequence_name, len(column_names), len(values)))
                _report(info_output)
                raise _Stop(1)
            rows[sequence_name] = dict(zip(column_names, values))
            for name, value in zip(column_names, values):
                counts = value_counts[name]
                counts[value] = counts.get(value, 0) + 1

        _report(info_output, text['total'] % (row_count, sequence_name))
    return column_names, rows, value_counts, row_count


def _build_outputs(info_output, options):
    """Read both tables, validate them and write the two new files.

    Every progress message goes to stdout and to *info_output*.
    Raises _Stop(1) on any validation error; no output table is created
    unless both inputs have been validated.
    """
    group_file_name = options['group_file']
    env_file_name = options['env_file']
    group_column_name = options['group_column']
    env_column_name = options['env_column']

    # --- banner and echo of the parameters --------------------------------
    banner = ("biodivprep Version %6.2F " % (biodivprep_version)
              + time.ctime(time.time()) + ' ')
    _out()
    _report(info_output, banner)
    _report(info_output)
    _report(info_output, " GROUP file name: %s" % (group_file_name))
    _report(info_output, " EXP file name: %s" % (env_file_name))
    _report(info_output, " Group column parameter: %s" % (group_column_name))
    _report(info_output, " EXP column parameter: %s" % (env_column_name))
    _report(info_output)

    # --- read the two tables ----------------------------------------------
    group_column_names, group_dict, _unused, _unused_count = _read_table(
        info_output, group_file_name, group_column_name, _GROUP_TEXT)
    env_column_names, env_dict, env_count_dict, env_row_count = _read_table(
        info_output, env_file_name, env_column_name, _EXP_TEXT)

    # --- statistics for the EXPeriment file -------------------------------
    _report(info_output)
    _report(info_output,
            "The EXPeriment file contains %d sequence names and %d EXPeriment column(s) as follows:"
            % (env_row_count, len(env_column_names)))
    for position, column in enumerate(env_column_names):
        _report(info_output, "Column %3d: %s" % (position + 1, column))
        counts = env_count_dict[column]
        for value in sorted(counts):  # sorted: report order must not depend on hashing
            _report(info_output, " %15s: %5d" % (value, counts[value]))
    _report(info_output)

    # --- both files must list exactly the same sequence names -------------
    mismatch = False
    for name in sorted(group_dict):
        if name not in env_dict:
            mismatch = True
            _report(info_output,
                    "GROUP Sequence name: %20s not in the EXPeriment file." % (name))
    for name in sorted(env_dict):
        if name not in group_dict:
            mismatch = True
            _report(info_output,
                    "EXP Sequence name %20s not in the GROUP file." % (name))
    if mismatch:
        _report(info_output)
        _report(info_output,
                "Error: See above list of sequence names. GROUP and EXPeriment files must contain exactly the same sequence names.")
        _report(info_output)
        raise _Stop(1)
    _report(info_output)
    _report(info_output,
            "The GROUP and EXPeriment files contain identical sets of sequence names. (Required)")
    _report(info_output)

    # --- names of the new files and the columns they will hold ------------
    new_group_file_name = ((group_column_name or "AllGROUPS") + '_'
                           + group_file_name.replace('x2xGROUP', 'bdpGROUP', 1))
    new_env_file_name = ((env_column_name or "ALLEXPeriments") + '_'
                         + env_file_name.replace('x2xEXP', 'bdpEXP', 1))

    if group_column_name:
        group_columns = [group_column_name]
        group_summary = "the single column %s" % (group_column_name)
    else:
        group_columns = group_column_names
        group_summary = "all %3d columns" % (len(group_columns))
    if env_column_name:
        env_columns = [env_column_name]
        env_summary = "the single column %s" % (env_column_name)
    else:
        env_columns = env_column_names
        env_summary = "all %3d columns" % (len(env_columns))

    # --- write both files, rows in sorted sequence-name order -------------
    sequence_names = sorted(group_dict)
    with open(new_group_file_name, "w") as output_group:
        output_group.write('\t'.join(['Sequence_name'] + group_columns) + '\n')
        for name in sequence_names:
            cells = [group_dict[name][column] for column in group_columns]
            output_group.write('\t'.join([name] + cells) + '\n')
    with open(new_env_file_name, "w") as output_env:
        output_env.write('\t'.join(['Sequence_name'] + env_columns) + '\n')
        for name in sequence_names:
            cells = [env_dict[name][column] for column in env_columns]
            output_env.write('\t'.join([name] + cells) + '\n')

    # --- closing messages --------------------------------------------------
    _report(info_output)
    _report(info_output, "New GROUP file written: %30s with %s"
            % (new_group_file_name, group_summary))
    _report(info_output, "New EXPeriment file written: %30s with %s"
            % (new_env_file_name, env_summary))
    _report(info_output)
    _report(info_output, "Number of sequence names written to both files: %d"
            % (len(sequence_names)))
    _report(info_output)


def _run(argv):
    """Carry out one complete run; raises _Stop to end early."""
    if len(argv) <= 1:
        # No parameters specified... issue the usage message and bail.
        _print_usage()
        raise _Stop(0)

    options = _parse_args(argv[1:])
    group_file_name = options['group_file']
    env_file_name = options['env_file']

    problems = []
    if not group_file_name:
        problems.append("Error: You must provide the *_x2x_GROUP.csv file name, e.g., -g _x2xGROUP.csv")
    if not env_file_name:
        problems.append("Error: You must provide the *_x2x_ENV.csv filename, e.g., -e _x2xEXP.csv")
    if group_file_name and group_file_name == env_file_name:
        problems.append("Error: The -g parameter and the -e parameter are identical. These must be separate files.")
    if problems:
        for message in problems:
            _out()
            _out(message)
            _out()
        _print_usage()
        raise _Stop(1)

    # Both inputs must sit in the current directory: the output names are
    # built by prefixing the input names, which would not survive a path.
    local_files = os.listdir(os.curdir)
    for template, file_name in (
            ("Error: GROUP file specified, %s, not found in local directory", group_file_name),
            ("Error: EXP file specified, %s, not found in local directory", env_file_name)):
        if file_name not in local_files:
            _out()
            _out(template % (file_name))
            _out()
            raise _Stop(1)

    info_output_file_name = group_file_name.replace('x2xGROUP.csv', 'bdp_info.txt', 1)
    info_output = open(info_output_file_name, 'w')
    try:
        _build_outputs(info_output, options)
    finally:
        info_output.close()  # also closed when a validation error stops the run


def main(argv=None):
    """Run biodivprep.

    argv -- full argument list including the program name at index 0;
            defaults to sys.argv.  The list is not modified.

    Returns the process exit status: 0 on success, for -h and when no
    parameters are given; 1 when any error was reported.
    """
    if argv is None:
        argv = sys.argv
    try:
        _run(list(argv))
    except _Stop as stop:
        return stop.status
    return 0


if __name__ == "__main__":
    sys.exit(main())