scripts/sort_fasta.py at master · mcveigh-h16/scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 10 10:45:44 2020

@author: mcveigh
"""

#
# script to selectively remove sequences from a fasta file based on an accession list
# default accession list is ITS_reject_seqs3.txt
# output file names are hard coded and will overwrite any existing file with the same name.
# accession match works for accession and accession.version. The vesion number is ignored for the match

import pandas as pd
import Bio
import os
import sys

inputfile = sys.argv[1]
#outputfile = sys.argv[2]

#Read in the reject list and find the accession
reject_file_name = (r'ITS_reject_seqs3.txt')
#reject_file_name = (r'/panfs/pan1.be-md.ncbi.nlm.nih.gov/dnaorg/ITS/ITS_reject_seqs')
rejectlist_df = pd.read_csv(reject_file_name, sep='\t', index_col=None, low_memory=False, header=None, names=["accession", "type", "reason"])
rejectlist = rejectlist_df['accession']
reject_list = set(rejectlist_df['accession'].tolist())

#Parse the GenBank file and remove any sequences found on the reject list
from Bio import SeqIO
sequences = []
found = []
#missingRNA = []


for seq_record in SeqIO.parse(inputfile, "fasta"):
    str_id = seq_record.id
    #print(seq_record.id)
    if str_id.find('.') != -1:
        str_id = str_id[:str_id.find('.')]
    if str_id not in reject_list:
        sequences.append(seq_record)
    else:
        found.append(seq_record)
        print("I found this accession on the reject list and wrote the sequence to found.fsa: ", seq_record.id)
SeqIO.write(sequences, "stripped.fsa", "fasta")
SeqIO.write(found, "found.fsa", "fasta")