#!/usr/bin/env python # file dnd2fa.py # Copyright (C) 2005 Regents of the University of Colorado # Contact: # James Smagala # smagala@colorado.edu # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Parse .dnd files, get accession numbers, and retrieve fasta records Assumes that all sequences of interest are in a single large fasta file Revision History 050418 James Smagala: Rework of parse_fa 050818 JS: Cleaned up, use optparse, fasta parser copied from confind.py """ from optparse import OptionParser from sys import exit import re class FastaFormatError(Exception): pass def parseFasta (infile): """Convert FASTA files into a comment,sequence pair Requires a file-like object as input, in FASTA form: >comment sequence Yields (comment,sequence) tuples """ # Get the first comment line comment = infile.readline() if comment.startswith('>'): comment = comment[1:].strip() else: raise FastaFormatError, "First character of FASTA file must be '>'" seq = [] def compileSeq(seq): # Ensure that the sequence data exists seq = ''.join(seq) if not seq: raise FastaFormatError, "FASTA record contains no sequence data" return seq for line in infile: # Check for the start of a new record if line.startswith('>'): seq = compileSeq(seq) yield comment,seq seq = [] comment = line[1:].strip() continue # Add the line to the sequence data, removing white space seq.append(''.join(line.split())) seq = compileSeq(seq) yield comment,seq def line_len(line,length=79): """Format long lines to a given length""" lines = [] while line: lines.append(line[:length]) line = line[length:] return '\n'.join(lines) def parseCommandLine(): """Get and check command line options and arguments.""" usage = "usage: %prog [OPTIONS] SOURCE_FASTA_FILE DND_FILE" version = "%prog 0.2" description = "Create a FASTA file containing all sequences with an \ identifier found in both SOURCE_FASTA_FILE and DND_FILE." parser = OptionParser(usage=usage,version=version,description=description) parser.add_option("-o", "--output", dest="outfile", help="output file location [default: %default]") parser.set_defaults(outfile="out.fas") options, args = parser.parse_args() try: dbfile = ''.join(args[0].split('"')) infile = ''.join(args[1].split('"')) except IndexError: parser.error("Missing input file") outfile = ''.join(options.outfile.split('"')) return infile,outfile,dbfile def main(): infile, outfile, dbfile = parseCommandLine() try: infile = open(infile,'r') dbfile = open(dbfile,'r') outfile = open(outfile,'w') except IOError, msg: print msg # parse the fa flat db into a dict seq_db = {} try: for record in parseFasta(dbfile): seq_db[''.join((record[0].split(None,1)[0]).split('_'))] = record except FastaFormatError, msg: print msg exit(1) str = ''.join([''.join(line.split()) for line in infile]) # remove all of the crap from around the accession numbers str = ' '.join(str.split('(')) str = ' '.join(str.split(',')) str = ' '.join(str.split(';')) str = ' '.join(str.split("'")) # Get rid of any distances dist = re.compile(r':.*?\.\d*') str = dist.sub(' ',str) # Get rid of any labels label = re.compile(r'\s*\).*?[$\s]+') str = label.sub(' ',str) # split on white space to get a list of accessions and distances # compensate for _ problems with CDC fasta comments li = str.split() li = [''.join(element.split('_')) for element in li] for acc in li: if acc in seq_db: # print the data in fasta format record = seq_db[acc] print >>outfile, ">%s\n%s" % (record[0],line_len(record[1])) else: print "Accession number '%s' is not in database." % acc # Clean up open files dbfile.close() infile.close() outfile.close() if __name__ == "__main__": main()