#!/usr/bin/python2.4 # file fa2fa.py # Copyright (C) 2005 Regents of the University of Colorado # Contact: # James Smagala # smagala@colorado.edu # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Read many .fa files to build one large .fa for use as a local BLAST db Can also exclude seqs. Revision History 050426 James Smagala: Rework of dnd2fa 050627 JS: Complete overhaul """ from optparse import OptionParser from glob import glob from sys import exit class FastaFormatError(Exception): pass def parseFasta (infile): """Convert FASTA files into a comment,sequence pair Requires a file-like object as input, in FASTA form: >comment sequence Yields (comment,sequence) tuples """ # Get the first comment line comment = infile.readline() if comment.startswith('>'): comment = comment[1:].strip() else: raise FastaFormatError, "First character of FASTA file must be '>'" seq = [] def compileSeq(seq): # Ensure that the sequence data exists seq = ''.join(seq) if not seq: raise FastaFormatError, "FASTA record contains no sequence data" return seq for line in infile: # Check for the start of a new record if line.startswith('>'): seq = compileSeq(seq) yield comment,seq seq = [] comment = line[1:].strip() continue # Add the line to the sequence data, removing white space seq.append(''.join(line.split())) seq = compileSeq(seq) yield comment,seq def line_len(line,length=79): """Format long lines to a given length""" lines = [] while line: lines.append(line[:length]) line = line[length:] return '\n'.join(lines) def parseCommandLine(): """Get and check command line options and arguments.""" usage = "usage: %prog [OPTIONS] FILE1 FILE2 FILE3..." version = "%prog 0.2" description = "Compile multiple fasta files into a larger file and/or \ exclude sequences from a fasta file." parser = OptionParser(usage=usage,version=version,description=description) parser.add_option("-e", "--exclude", dest="exfile", metavar="FILE", type="string", help="path to a fasta FILE of sequences to be excluded " "from the output") parser.set_defaults(exfile="") parser.add_option("-o", "--output", dest="outfile", metavar="FILE", type="string", help="path to the output FILE to be created or " "overwritten [default: %default]") parser.set_defaults(outfile="out.fas") options, args = parser.parse_args() # Get rid of any quotes, expand any wildcards infiles = [''.join(file.split('"')) for file in args] if not infiles: parser.error("Missing input file") outfile = ''.join(options.outfile.split('"')) exfile = ''.join(options.exfile.split('"')) return infiles,exfile,outfile def main(): infiles,exfile,outfile = parseCommandLine() # Get the identifiers for the exceptions ex_db = {} if exfile: try: exfile = open(exfile,'r') except IOError, msg: print msg exit(1) for comment,seq in parseFasta(exfile): ex_db[comment] = None exfile.close() # Open the output file for writing try: outfile = open(outfile,'w') except IOError, msg: print msg exit(1) # Loop through all input files for file in infiles: try: infile = open(file,'r') except IOError, msg: print msg continue try: for comment,seq in parseFasta(infile): if not comment in ex_db: print >>outfile, ">%s\n%s" % (comment,line_len(seq)) infile.close() except FastaFormatError, msg: print "%s: %s" % (file,msg) infile.close() # Clean up open files outfile.close() if __name__ == "__main__": main()