# adamhockenberry / bacphlip

import pkg_resources
from Bio import SeqIO, SearchIO
import os
import subprocess
import pandas as pd
from collections import OrderedDict
import joblib
import sys
import shutil
import glob

"""
TODO:
    Check that input fastas are DNA and output is amino acids
    Check for suspiciously short/long genome lengths
    Allow for batch input (one fasta per line)
    Allow for existing amino acid file input (rather than 6frame)
    Allow for custom hmmsearch
"""

SKLEARN_CLASSIFIER = pkg_resources.resource_filename('bacphlip', 'data/rf_best.joblib')
HMMER_DB = pkg_resources.resource_filename('bacphlip', 'data/prot_models.hmm')

TOTAL_PROTEIN_MODELS=206
MIN_PROT_LENGTH=40

def check_existing_file(infile):
    """
    Checks that a file exists and raises a (hopefully) informative error message
    if it does not.

    Inputs:
        infile - path to a hopefully existing file
    """
    if not os.path.exists(infile):
        raise Exception("Input file ({}) does not appear to exist.".format(infile))

def check_nonexisting_path(outfile):
    """
    Checks whether a file to be written already exists and raises an informative error
    if it does.

    Inputs:
        outfile - path to a hopefully non-existent file
    """
    if os.path.exists(outfile):
        raise Exception("Specified output file ({}) appears to already exist. "
                "Remove before running program again or specify force_overwrite flag ('-f').".format(outfile))

def check_genome_fasta_reqs(nt_records):
    """
    Checks whether a fasta file (meant to be a genome (i.e. DNA) file) can be read and ensures that it only
    contains one record.

    A remaining check to implement is that the fasta record is of DNA type.

    Inputs:
        nt_records - list from SeqIO.parse that ideally contains one element.
    """
    if len(nt_records) == 1:
        pass
    elif len(nt_records) == 0:
        raise Exception("Input fasta file appears to be empty. "
                "Ensure that the file contains what you think it does.")
    else:
        raise Exception("Input fasta file appears to contain more than one sequence record. "
                "Please run bacphlip with the '--multi_fasta' flag if your file contains "
                "more than one genome.")
    return

def check_genome_multifasta_reqs(nt_records):
    """
    Checks whether a fasta file (meant to be a genome (i.e. DNA) file) can be read and ensures that it contains
    more than one record.

    A remaining check to implement is that the fasta records are of DNA type.

    Inputs:
        nt_records - list from SeqIO.parse that ideally contains more than one element
    """
    if len(nt_records) == 1:
        raise Exception("Input fasta file appears to contain a single sequence record. "
                "Please re-run analysis without the '--multi_fasta' flag.")
    elif len(nt_records) == 0:
        raise Exception("Input fasta file appears to be empty. "
                "Ensure that the file contains what you think it does.")
    else:
        pass
    return

def check_record_id_uniqueness(nt_records):
    """
    For multi-fasta inputs, this makes sure that the records are uniquely named since they
    will be used to name files that we do not want to be over-written.

    Inputs:
        nt_records - list from SeqIO.parse that ideally contains more than one element
    """
    record_ids = [record.id for record in nt_records]
    n_record_ids = len(record_ids)
    unique_n_record_ids = len(list(set(record_ids)))
    if n_record_ids != unique_n_record_ids:
        raise Exception("Input multi-fasta file appears to contain more than one record with "
                "the same id (as parsed by biopython 'record.id'). BACPHLIP uses this id to "
                "write temporary files and will thus error. Please ensure that each record in "
                "the multi-fasta file is uniquely named before proceeding. Exiting.")

    if len(nt_records) == 1:
        raise Exception("Input fasta file appears to contain a single sequence record. "
                "Please re-run analysis without the '--multi_fasta' flag.")
    elif len(nt_records) == 0:
        raise Exception("Input fasta file appears to be empty. "
                "Ensure that the file contains what you think it does.")
    else:
        pass
    return

def six_frame_translate(fasta_file_path, output_file_path, multi_fasta=False, force_overwrite=False):
    """
    Reads a genome fasta file and outputs either an amino acid fasta file containing all possible
    six frame translational reading frames that are longer than a set length threshold OR (multi_fasta option)
    a directory full of fasta files that each contains the six frame translational reading frames for a
    given multi-fasta.

    Inputs:
        fasta_file_path - system path to a valid genome (nucleotide) fasta file
        output_file_path - system path for writing the amino acid file
        multi_fasta - if True will create a directory and write numerous output files into this
                        directory (one for each record in the fasta file)
        force_overwrite - if True will write "output_file_path" regardless of whether it exists
    """
    check_existing_file(fasta_file_path)
    ###Read in file
    nt_records = list(SeqIO.parse(fasta_file_path, format='fasta'))

    if not force_overwrite:
        check_nonexisting_path(output_file_path)

    if multi_fasta:
        check_genome_multifasta_reqs(nt_records)
        check_record_id_uniqueness(nt_records)
        if os.path.exists(output_file_path):
            if force_overwrite:
                shutil.rmtree(output_file_path)
        os.mkdir(output_file_path)

    else:
        check_genome_fasta_reqs(nt_records)

    ###Run basic code
    for nt_record in nt_records:
        genome_id = nt_record.id
        prots = []
        for i in [0, 1, 2]:###Each of the three reading frames
            modulo = len(nt_record[i:])%3
            assert modulo < 3
            ###Translate the regular strand
            if modulo > 0:
                tempy = nt_record[i:-modulo].translate()
            else:
                tempy = nt_record[i:].translate()
            ###Split according to stop codons
            seq = str(tempy.seq).split('*')
            ###Append the sequences if they are longer than the defined minimum
            for j in seq:
                if len(j) >= MIN_PROT_LENGTH:
                    prots.append(j)
            ###Repeat for the reverse complement
            if modulo > 0:
                tempy = nt_record.reverse_complement()[i:-modulo].translate()
            else:
                tempy = nt_record.reverse_complement()[i:].translate()
            seq = str(tempy.seq).split('*')
            for j in seq:
                if len(j) >= MIN_PROT_LENGTH:
                    prots.append(j)
        if multi_fasta:
            ind_outfile_path = output_file_path + '{}.6frame'.format(genome_id)
            try:
                check_nonexisting_path(ind_outfile_path)
            except Exception:
                raise Exception("Specified output file ({}) appears to already exist. Exiting.".format(ind_outfile_path))

            with open(ind_outfile_path, 'w') as outfile:
                for i, seq in enumerate(prots):
                    outfile.write('>{}_{}\n{}\n'.format(genome_id, i, seq))
        else:
            with open(output_file_path, 'w') as outfile:
                for i, seq in enumerate(prots):
                    outfile.write('>{}_{}\n{}\n'.format(genome_id, i, seq))
            break
    return
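
# Illustrative usage sketch (not part of the original pipeline): shows how six_frame_translate
# is typically called for a single-record genome and for a multi-record file. The file and
# directory names below are hypothetical placeholders, not files shipped with BACPHLIP.
def _example_six_frame_translate():
    # Single genome: writes one amino acid fasta file containing every reading frame
    # longer than MIN_PROT_LENGTH residues.
    six_frame_translate('phage_genome.fasta', 'phage_genome.fasta.6frame')
    # Multi-fasta input: writes one '<record.id>.6frame' file per record into a new directory.
    six_frame_translate('many_phages.fasta', 'many_phages.fasta.BACPHLIP_DIR/',
            multi_fasta=True, force_overwrite=True)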

def hmmsearch_py(aa_fasta_file, hmmsearch_out, force_overwrite=False, local_hmmsearch=False):
    """
    Runs hmmsearch on an amino acid fasta file against a pre-compiled database of protein domains
    of interest.

    Inputs:
        aa_fasta_file - system path to a valid amino acid fasta file
        hmmsearch_out - system path for writing the hmmsearch output file
        force_overwrite - if True will write "hmmsearch_out" regardless of whether it exists
        local_hmmsearch - optional path to a local hmmsearch binary (defaults to the system "hmmsearch")
    """

    check_existing_file(aa_fasta_file)
    if not force_overwrite:
        check_nonexisting_path(hmmsearch_out)
    #
    hmmsearch_call = "hmmsearch"
    if local_hmmsearch:
        if not os.path.exists(local_hmmsearch):
            raise Exception("You specified a local hmmsearch install (path={}) but this file does not appear to exist.".format(local_hmmsearch))
        hmmsearch_call = local_hmmsearch
    #
    with open(hmmsearch_out, 'w') as outfile:
        try:
            subprocess.run(args=[hmmsearch_call, HMMER_DB, aa_fasta_file], check=True, stdout=outfile)
        except subprocess.CalledProcessError as e:
            raise Exception("System call to outside program \"hmmsearch\" failed. Please ensure that you have hmmsearch 3.x "\
            "or greater properly installed. BACPHLIP assumes by default that the install is available in the system path but "\
            "local installs can be supplied with the \"--local_hmmsearch\" flag followed by the path to a local hmmsearch.") from e
    return
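
# Illustrative usage sketch (not part of the original pipeline): hmmsearch_py is a thin wrapper
# around the external HMMER binary, roughly equivalent to the shell command
# `hmmsearch prot_models.hmm <input>.6frame > <input>.hmmsearch`. The paths below are
# hypothetical placeholders.
def _example_hmmsearch_py():
    # Default: assumes 'hmmsearch' is available on the system PATH.
    hmmsearch_py('phage_genome.fasta.6frame', 'phage_genome.fasta.hmmsearch')
    # Alternatively, point at a local HMMER install.
    hmmsearch_py('phage_genome.fasta.6frame', 'phage_genome.fasta.hmmsearch',
            force_overwrite=True, local_hmmsearch='/opt/hmmer/bin/hmmsearch')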

def process_hmmsearch(hmmsearch_file, hmmsearch_df_out, force_overwrite=False):
    """
    Processes the hmmsearch default output file into a pandas dataframe -> tab-separated file
    describing the presence/absence of each protein in the hmmsearch file.

    Inputs:
        hmmsearch_file - system path to a valid hmmsearch output file
        hmmsearch_df_out - system path for writing the processed tsv file
        force_overwrite - if True will write "hmmsearch_df_out" regardless of whether it exists
    """
    check_existing_file(hmmsearch_file)
    if not force_overwrite:
        check_nonexisting_path(hmmsearch_df_out)
    with open(hmmsearch_file, 'r') as infile:
        results = list(SearchIO.parse(infile, 'hmmer3-text'))
        simple_res = []
        for i in results:
            if len(i.hits) > 0:
                simple_res.append((i.id, 1))
            else:
                simple_res.append((i.id, 0))
    if len(simple_res) != TOTAL_PROTEIN_MODELS:
        raise Exception('The number of parsed hmmsearch results does not match the expected number of protein models ({}).'.format(TOTAL_PROTEIN_MODELS))
    simple_res = sorted(simple_res, key=lambda x: x[0])
    single_df = pd.DataFrame(OrderedDict(simple_res), index=[0])
    single_df.to_csv(hmmsearch_df_out, sep='\t')
    return
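
# Illustrative usage sketch (not part of the original pipeline): converts the raw hmmsearch text
# output into a one-row presence/absence table with TOTAL_PROTEIN_MODELS columns (1 = at least
# one hit for that protein model, 0 = no hit). File names are hypothetical placeholders.
def _example_process_hmmsearch():
    process_hmmsearch('phage_genome.fasta.hmmsearch', 'phage_genome.fasta.hmmsearch.tsv')
    # The resulting tsv is the feature vector fed to the classifier and can be inspected directly.
    features = pd.read_csv('phage_genome.fasta.hmmsearch.tsv', sep='\t', index_col=0)
    print(features.shape)  # expected: one row, TOTAL_PROTEIN_MODELS columns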

def predict_lifestyle(hmmsearch_df, predictions_out, force_overwrite=False):
    """
    Predicts the lifestyle of a phage given the path to a table describing the presence/absence of a set of
    protein domains and a pre-trained scikit-learn classifier.

    Inputs:
        hmmsearch_df - system path to a valid tsv file
        predictions_out - system path for writing the final predictions file
        force_overwrite - if True will write "predictions_out" regardless of whether it exists
    """
    if not force_overwrite:
        check_nonexisting_path(predictions_out)
    clf = joblib.load(SKLEARN_CLASSIFIER)
    ###Load dataset
    single_df = pd.read_csv(hmmsearch_df, sep='\t', index_col=0)
    ###Predict
    class_probs = clf.predict_proba(single_df)
    ###Write output
    with open(predictions_out, 'w') as outfile:
        outfile.write('\t{}\t{}\n'.format('Virulent', 'Temperate'))
        for i in range(len(class_probs)):
            outfile.write('{}\t{}\t{}\n'.format(single_df.index[i], class_probs[i][0], class_probs[i][1]))
    return
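
# Illustrative usage sketch (not part of the original pipeline): loads the packaged random forest
# classifier and writes per-record class probabilities. File names are hypothetical placeholders.
def _example_predict_lifestyle():
    predict_lifestyle('phage_genome.fasta.hmmsearch.tsv', 'phage_genome.fasta.bacphlip')
    # The output is a tab-separated table with one row per input record and two probability
    # columns ('Virulent' and 'Temperate') that sum to one.
    print(open('phage_genome.fasta.bacphlip').read())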

def parse_cmd_line_args():
    """
    Parses command-line arguments for the BACPHLIP pipeline and returns the resulting namespace.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file",\
            required=True, help="Should be a valid path to a single genome (nucleotide) FASTA file containing only 1 record/contig.")
    parser.add_argument("-f", "--force_overwrite", action="store_true",\
            help="Whether to overwrite all existing files that will be created if they exist. Default is False")
    parser.add_argument("--multi_fasta", default=False, action="store_true",\
            help="By default, BACPHLIP assumes that the input file contains one genome (nucleotide) sequence record. "
                    "Users providing a multi_fasta input file must use this flag. Note that each record should be uniquely "
                    "named and should contain complete genomes for different phages. BACPHLIP should not be run on "
                    "incomplete / fragmented genomes spanning multiple records.")
    parser.add_argument("--local_hmmsearch", default=False,\
            help="By default, BACPHLIP assumes a system install of \"hmmsearch\". Use this flag to provide a custom path "
                    "to a local install of hmmsearch if necessary.")
    args = parser.parse_args()
    return args
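
# Illustrative usage sketch (not part of the original pipeline): argparse reads sys.argv by
# default, so a command line can be simulated for testing. The genome path is a hypothetical
# placeholder.
def _example_parse_cmd_line_args():
    sys.argv = ['bacphlip', '-i', 'phage_genome.fasta', '-f', '--multi_fasta']
    args = parse_cmd_line_args()
    print(args.input_file, args.force_overwrite, args.multi_fasta, args.local_hmmsearch)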

def run_pipeline(input_file_path, force_overwrite=False, local_hmmsearch=False):
    """
    Command-line implementation of the full BACPHLIP prediction pipeline. Currently implemented only for single genome
    inputs but lots of time could be saved during the classifier prediction step by implementing a batch option.
    """
    ###
    six_frame_file = input_file_path + '.6frame'
    hmmsearch_file = input_file_path + '.hmmsearch'
    hmmsearch_df = input_file_path + '.hmmsearch.tsv'
    predictions_file = input_file_path + '.bacphlip'
    ###
    print('#################################################################################')
    print('Beginning BACPHLIP pipeline')
    six_frame_translate(input_file_path, six_frame_file, force_overwrite=force_overwrite)
    print('Finished six frame translation of genome (nucleotide) with output stored in {}'.format(six_frame_file))
    hmmsearch_py(six_frame_file, hmmsearch_file, force_overwrite=force_overwrite, local_hmmsearch=local_hmmsearch)
    print('Finished outside call to hmmsearch with output stored in {}'.format(hmmsearch_file))
    process_hmmsearch(hmmsearch_file, hmmsearch_df, force_overwrite=force_overwrite)
    print('Finished converting hmmsearch with output stored in {}'.format(hmmsearch_df))
    predict_lifestyle(hmmsearch_df, predictions_file, force_overwrite=force_overwrite)
    print('Finished with BACPHLIP predictions! Final output file stored in {}'.format(predictions_file))
    print('#################################################################################')
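
# Illustrative usage sketch (not part of the original pipeline): the single-genome pipeline writes
# all intermediate and final files alongside the input, ending with '<input>.bacphlip'. The path
# below is a hypothetical placeholder.
def _example_run_pipeline():
    run_pipeline('phage_genome.fasta', force_overwrite=True)
    # Files created next to the input:
    #   phage_genome.fasta.6frame, .hmmsearch, .hmmsearch.tsv, and .bacphlip (final predictions)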

def compile_full_hmmsearch_df(output_file, save_dir, force_overwrite):
    """
    Compiles the individual per-record hmmsearch tsv files found in "save_dir" into a single
    tab-separated table (one row per record) written to "output_file".
    """
    if not force_overwrite:
        check_nonexisting_path(output_file)
    full_df = pd.DataFrame()
    for df_file in glob.glob(save_dir + '*.hmmsearch.tsv'):
        ind_name = df_file.split('/')[-1].split('.hmmsearch.tsv')[0]
        ind_df = pd.read_csv(df_file, sep='\t', index_col=0)
        ind_df.index = [ind_name]
        full_df = pd.concat((full_df, ind_df))
    full_df.to_csv(output_file, sep='\t')

    return
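
# Illustrative usage sketch (not part of the original pipeline): gathers the per-record
# '*.hmmsearch.tsv' files from a working directory into one table, with each row named after the
# record it came from. Paths are hypothetical placeholders; note that save_dir is expected to end
# with a path separator, as in run_pipeline_multi below.
def _example_compile_full_hmmsearch_df():
    compile_full_hmmsearch_df('many_phages.fasta.hmmsearch.tsv',
            'many_phages.fasta.BACPHLIP_DIR/', force_overwrite=True)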

def run_pipeline_multi(input_file_path, force_overwrite=False, local_hmmsearch=False):
    """
    Command-line implementation of the full BACPHLIP prediction pipeline for multi-fasta inputs. Each record in the
    input file is translated and searched individually before the results are compiled into a single predictions file.
    """
    #Create a temporary directory for the files to reside
    output_dir = input_file_path + '.BACPHLIP_DIR/'

    print('#################################################################################')
    print('Beginning BACPHLIP pipeline')
    six_frame_translate(input_file_path, output_dir, multi_fasta=True, force_overwrite=force_overwrite)
    print('Finished six frame translation of all nucleotide records with outputs stored in {}'.format(output_dir))

    for six_frame_file in glob.glob(output_dir + '*.6frame'):
        indiv_file_path = six_frame_file.split('.6frame')[0]
        #
        hmmsearch_file = indiv_file_path + '.hmmsearch'
        hmmsearch_df = indiv_file_path + '.hmmsearch.tsv'
        #
        hmmsearch_py(six_frame_file, hmmsearch_file, force_overwrite=force_overwrite, local_hmmsearch=local_hmmsearch)
        #
        process_hmmsearch(hmmsearch_file, hmmsearch_df, force_overwrite=force_overwrite)

    print('Finished hmmsearch calls and processing for all records')

    all_hmmsearch_file = input_file_path + '.hmmsearch.tsv'
    compile_full_hmmsearch_df(all_hmmsearch_file, output_dir, force_overwrite=force_overwrite)

    predictions_file = input_file_path + '.bacphlip'
    predict_lifestyle(all_hmmsearch_file, predictions_file, force_overwrite=force_overwrite)
    print('Finished with BACPHLIP predictions! Final output file stored in {}'.format(predictions_file))
    print('#################################################################################')
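
# Illustrative usage sketch (not part of the original pipeline): the multi-fasta pipeline expects
# a file with several uniquely named, complete genome records and produces a single predictions
# file with one row per record. The path is a hypothetical placeholder.
def _example_run_pipeline_multi():
    run_pipeline_multi('many_phages.fasta', force_overwrite=True)
    # Per-record intermediates live in 'many_phages.fasta.BACPHLIP_DIR/'; final predictions are
    # written to 'many_phages.fasta.bacphlip'.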

#if __name__ == '__main__':
#    args = parse_cmd_line_args()
#    if args.multi_fasta:
#        run_pipeline_multi(args.input_file, force_overwrite=args.force_overwrite, local_hmmsearch=args.local_hmmsearch)
#    else:
#        run_pipeline(args.input_file, force_overwrite=args.force_overwrite, local_hmmsearch=args.local_hmmsearch)
