Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 42 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,51 @@ jobs:
- name: Download test data
run: |
wget https://zenodo.org/records/14679815/files/test.fastq.gz?download=1 -O test.fastq.gz
gunzip test.fastq.gz
# Keep both gzipped and uncompressed versions for testing
gunzip -c test.fastq.gz > test.fastq

# Run CLI tests
- name: MaskPrimers extract
- name: MaskPrimers extract (uncompressed)
run: |
MaskPrimers.py extract --help
MaskPrimers.py extract -s test.fastq --start 17 --len 10 --barcode --mode cut --log log.txt --nproc 4 --outdir output --outname MaskPrimers-extract --failed

# Test gzip functionality
- name: MaskPrimers extract (gzip input/output)
run: |
# Test gzip input with gzip output
MaskPrimers.py extract -s test.fastq.gz --start 17 --len 10 --barcode --mode cut --gzip-output --log log-gzip.txt --nproc 4 --outdir output --outname MaskPrimers-gzip --failed

# Verify at least one output file was created
if [ ! -f "output/MaskPrimers-gzip_primers-pass.fastq.gz" ] && [ ! -f "output/MaskPrimers-gzip_primers-fail.fastq.gz" ]; then
echo "❌ ERROR: No compressed output files were created!"
exit 1
fi

# Compare results between uncompressed and gzipped processing
- name: Compare uncompressed vs gzipped results
run: |
echo "Comparing results between uncompressed and gzipped processing..."

python3 -c "
import sys

from Bio import SeqIO
from presto.IO import openFile


def read_records(path):
    # Return the set of (record id, sequence string) pairs parsed from the
    # FASTQ file at path. Both outputs are opened through presto.IO.openFile
    # so the plain and .gz files go through the same reader.
    with openFile(path, 'r') as handle:
        return {(rec.id, str(rec.seq)) for rec in SeqIO.parse(handle, 'fastq')}


# NOTE: this script runs inside a double-quoted shell string, so it must not
# contain double quotes, dollar signs, backticks or backslashes.
uncomp_records = read_records('output/MaskPrimers-extract_primers-pass.fastq')
gzip_records = read_records('output/MaskPrimers-gzip_primers-pass.fastq.gz')

print(f'Uncompressed records: {len(uncomp_records)}')
print(f'Gzipped records: {len(gzip_records)}')

if uncomp_records != gzip_records:
    print('❌ ERROR: Files contain different sequences!')
    # Use sys.exit rather than the site-provided exit(), which is intended
    # for the interactive interpreter and is missing under python -S.
    sys.exit(1)

print('✅ SUCCESS: Files contain identical sequences')
"


12 changes: 8 additions & 4 deletions bin/AlignSets.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from presto.Annotation import parseAnnotation
from presto.Applications import runMuscle
from presto.Sequence import calculateDiversity, indexSeqSets
from presto.IO import readPrimerFile, getOutputHandle, printLog, printWarning, printError
from presto.IO import readPrimerFile, getOutputHandle, openFile, printLog, printWarning, printError
from presto.Multiprocessing import SeqResult, manageProcesses, feedSeqQueue, \
collectSeqQueue

Expand Down Expand Up @@ -167,14 +167,18 @@ def writeOffsetFile(primer_file, align_func=runMuscle, align_args={},

# Open output handle
if out_file is not None:
out_handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
out_handle = openFile(out_file, 'w')
else:
out_tag = 'reverse' if reverse else 'forward'
out_handle = getOutputHandle(primer_file,
'offsets-%s' % out_tag,
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type='tab')
out_type='tab',
gzip_output=out_args.get('gzip_output', False))

# Write offset table
for k, v in offset_dict.items():
Expand Down Expand Up @@ -427,7 +431,7 @@ def getArgParser():
description='Create a 5\' offset table by primer multiple alignment.')
group_table = parser_table.add_argument_group('alignment table generation arguments')
group_table.add_argument('-p', action='store', dest='primer_file', required=True,
help='A FASTA file containing primer sequences.')
help='A FASTA file containing primer sequences. Supports both uncompressed and gzip-compressed (.gz) files.')
group_table.add_argument('-o', action='store', dest='out_file', default=None,
help='''Explicit output file name. Note, this argument cannot be used with
the --failed, --outdir, or --outname arguments. If unspecified, then
Expand Down
18 changes: 13 additions & 5 deletions bin/ClusterSets.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from presto.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from presto.Annotation import parseAnnotation, flattenAnnotation, mergeAnnotation
from presto.Applications import runCDHit, runUClust, default_max_memory
from presto.IO import countSeqFile, getFileType, getOutputHandle, printLog, printMessage, \
from presto.IO import countSeqFile, getFileType, getOutputHandle, openFile, printLog, printMessage, \
printProgress, readSeqFile, printError, printWarning
from presto.Sequence import indexSeqSets
from presto.Multiprocessing import SeqResult, manageProcesses, feedSeqQueue, \
Expand Down Expand Up @@ -313,13 +313,17 @@ def _header(seq, cluster, field=cluster_field, prefix=cluster_prefix,

# Open output file handles
if out_file is not None:
pass_handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
pass_handle = openFile(out_file, 'w')
else:
pass_handle = getOutputHandle(seq_file,
'cluster-pass',
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type=out_args['out_type'])
out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False))

# Open indexed sequence file
seq_dict = readSeqFile(seq_file, index=True)
Expand Down Expand Up @@ -443,13 +447,17 @@ def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):

# Open output file handles
if out_file is not None:
pass_handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
pass_handle = openFile(out_file, 'w')
else:
pass_handle = getOutputHandle(seq_file,
'cluster-pass',
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type=out_args['out_type'])
out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False))

# Open indexed sequence file
seq_dict = readSeqFile(seq_file, index=True)
Expand Down
16 changes: 11 additions & 5 deletions bin/CollapseSeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from presto.Annotation import parseAnnotation, flattenAnnotation, mergeAnnotation, \
collapseAnnotation
from presto.Sequence import checkSeqEqual
from presto.IO import getFileType, readSeqFile, getOutputHandle, printLog, printProgress
from presto.IO import getFileType, readSeqFile, getOutputHandle, openFile, printLog, printProgress

# Default parameters
default_max_missing = 0
Expand Down Expand Up @@ -243,13 +243,17 @@ def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,

# Open unique record output handle
if out_file is not None:
pass_handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
pass_handle = openFile(out_file, 'w')
else:
pass_handle = getOutputHandle(seq_file,
'collapse-unique',
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type=out_args['out_type'])
out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False))
# Define log handle
if out_args['log_file'] is None:
log_handle = None
Expand Down Expand Up @@ -319,15 +323,17 @@ def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
# Write sequences with high missing character counts
if out_args['failed'] and not keep_missing:
with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
out_name=out_args['out_name'], out_type=out_args['out_type']) \
out_name=out_args['out_name'], out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False)) \
as missing_handle:
for k in search_keys:
SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

if out_args['failed']:
# Write duplicate sequences
with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'],
out_name=out_args['out_name'], out_type=out_args['out_type']) \
out_name=out_args['out_name'], out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False)) \
as dup_handle:
for k in dup_keys:
SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])
Expand Down
10 changes: 7 additions & 3 deletions bin/ConvertHeaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
convertSRAHeader, convertMIGECHeader
from presto.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
from presto.IO import getFileType, readSeqFile, countSeqFile, getOutputHandle, \
printLog, printProgress
openFile, printLog, printProgress


def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
Expand Down Expand Up @@ -61,13 +61,17 @@ def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_a
# Wrapper for opening handles and writers
def _open(x, out_file=out_file):
if out_file is not None and x == 'pass':
handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
handle = openFile(out_file, 'w')
else:
handle = getOutputHandle(seq_file,
'convert-%s' % x,
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type=out_args['out_type'])
out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False))
return handle

# Count records
Expand Down
10 changes: 6 additions & 4 deletions bin/EstimateError.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,8 @@ def writeResults(results, seq_file, out_args):
dist_df[['all']] = dist_df[['all']].astype(int)

# Write to tab delimited files
file_args = {'out_dir': out_args['out_dir'], 'out_name': out_args['out_name'], 'out_type': 'tab'}
file_args = {'out_dir': out_args['out_dir'], 'out_name': out_args['out_name'], 'out_type': 'tab',
'gzip_output': out_args.get('gzip_output', False)}
with getOutputHandle(seq_file, 'error-position', **file_args) as pos_handle, \
getOutputHandle(seq_file, 'error-quality', **file_args) as qual_handle, \
getOutputHandle(seq_file, 'error-nucleotide', **file_args) as nuc_handle, \
Expand Down Expand Up @@ -654,7 +655,8 @@ def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
thresh_df = pd.DataFrame.from_dict({'thresh': {'ALL': dist_df.index[int(np.mean([index for index in np.argsort(window) \
if dist[index] == np.min(window)]))]}
})
file_args = {'out_dir':out_args['out_dir'], 'out_name':out_args['out_name'], 'out_type':'tab'}
file_args = {'out_dir':out_args['out_dir'], 'out_name':out_args['out_name'], 'out_type':'tab',
'gzip_output': out_args.get('gzip_output', False)}

# Output as tsv
with getOutputHandle(seq_file, 'distance-barcode', **file_args) as dist_handle, \
Expand Down Expand Up @@ -755,7 +757,7 @@ def getArgParser():
subparsers.required = True

# Error profiling arguments for sets
parent_set = getCommonArgParser(failed=False, seq_out=False, log=True, out_file=False, multiproc=True)
parent_set = getCommonArgParser(failed=False, seq_out=True, log=True, out_file=False, multiproc=True)
parser_set = subparsers.add_parser('set', parents=[parent_set],
formatter_class=CommonHelpFormatter, add_help=False,
help='Estimates error statistics within annotation sets.',
Expand Down Expand Up @@ -785,7 +787,7 @@ def getArgParser():
parser_set.set_defaults(func=estimateSets)

# Error profiling arguments for barcodes
parent_barcode = getCommonArgParser(failed=False, seq_out=False, log=False, out_file=False, multiproc=False)
parent_barcode = getCommonArgParser(failed=False, seq_out=True, log=False, out_file=False, multiproc=False)
parser_barcode = subparsers.add_parser('barcode', parents=[parent_barcode],
formatter_class=CommonHelpFormatter, add_help=False,
help='Calculates pairwise distance metrics of barcode sequences.',
Expand Down
4 changes: 2 additions & 2 deletions bin/MaskPrimers.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def getArgParser():
description='Find primer matches using pairwise local alignment.')
group_align = parser_align.add_argument_group('primer alignment arguments')
group_align.add_argument('-p', action='store', dest='primer_file', required=True,
help='A FASTA file containing primer sequences.')
help='A FASTA file containing primer sequences. Supports both uncompressed and gzip-compressed (.gz) files.')
group_align.add_argument('--maxerror', action='store', dest='max_error', type=float,
default=default_primer_max_error, help='Maximum allowable error rate.')
group_align.add_argument('--maxlen', action='store', dest='max_len', type=int,
Expand Down Expand Up @@ -399,7 +399,7 @@ def getArgParser():
description='Find primer matches by scoring primers at a fixed position.')
group_score = parser_score.add_argument_group('primer scoring arguments')
group_score.add_argument('-p', action='store', dest='primer_file', required=True,
help='A FASTA file containing primer sequences.')
help='A FASTA file containing primer sequences. Supports both uncompressed and gzip-compressed (.gz) files.')
group_score.add_argument('--start', action='store', dest='start', type=int, default=default_primer_start,
help='The starting position of the primer.')
group_score.add_argument('--maxerror', action='store', dest='max_error', type=float,
Expand Down
12 changes: 8 additions & 4 deletions bin/PairSeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,19 @@ def _key_func(x):

# Open output file handles
pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
out_name=out_name_1, out_type=out_type_1)
out_name=out_name_1, out_type=out_type_1,
gzip_output=out_args.get('gzip_output', False))
pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
out_name=out_name_2, out_type=out_type_2)
out_name=out_name_2, out_type=out_type_2,
gzip_output=out_args.get('gzip_output', False))

if out_args['failed']:
fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
out_name=out_name_1, out_type=out_type_1)
out_name=out_name_1, out_type=out_type_1,
gzip_output=out_args.get('gzip_output', False))
fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
out_name=out_name_2, out_type=out_type_2)
out_name=out_name_2, out_type=out_type_2,
gzip_output=out_args.get('gzip_output', False))
pass_keys = list()

# Iterate over pairs and write to output files
Expand Down
18 changes: 13 additions & 5 deletions bin/ParseHeaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
addHeader, collapseHeader, copyHeader, deleteHeader, \
expandHeader, mergeHeader, renameHeader
from presto.IO import getFileType, readSeqFile, countSeqFile, getOutputHandle, \
printLog, printProgress
openFile, printLog, printProgress


def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
Expand Down Expand Up @@ -62,13 +62,17 @@ def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=de
seq_iter = readSeqFile(seq_file)
if out_args['out_type'] is None: out_args['out_type'] = in_type
if out_file is not None:
out_handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
out_handle = openFile(out_file, 'w')
else:
out_handle = getOutputHandle(seq_file,
'reheader',
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type=out_args['out_type'])
out_type=out_args['out_type'],
gzip_output=out_args.get('gzip_output', False))
# Count records
result_count = countSeqFile(seq_file)

Expand Down Expand Up @@ -127,13 +131,17 @@ def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args):
# Open file handles
seq_iter = readSeqFile(seq_file)
if out_file is not None:
out_handle = open(out_file, 'w')
# For explicit output files, check if gzip is needed
if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
out_file = out_file + '.gz'
out_handle = openFile(out_file, 'w')
else:
out_handle = getOutputHandle(seq_file,
'headers',
out_dir=out_args['out_dir'],
out_name=out_args['out_name'],
out_type='tab')
out_type='tab',
gzip_output=out_args.get('gzip_output', False))
# Count records
result_count = countSeqFile(seq_file)

Expand Down
Loading