immcantation · ssnn-airr · Jan 14, 2026 · Nov 14, 2025 · Nov 24, 2025 · Jan 12, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -42,11 +42,51 @@ jobs:
       - name: Download test data
         run: |
           wget https://zenodo.org/records/14679815/files/test.fastq.gz?download=1 -O test.fastq.gz
-          gunzip test.fastq.gz
+          # Keep both gzipped and uncompressed versions for testing
+          gunzip -c test.fastq.gz > test.fastq
 
       # Run CLI tests
-      - name: MaskPrimers extract
+      - name: MaskPrimers extract (uncompressed)
         run: |
           MaskPrimers.py extract --help
           MaskPrimers.py extract -s test.fastq --start 17 --len 10 --barcode --mode cut --log log.txt --nproc 4 --outdir output --outname MaskPrimers-extract --failed
+
+      # Test gzip functionality
+      - name: MaskPrimers extract (gzip input/output)
+        run: |
+          # Test gzip input with gzip output
+          MaskPrimers.py extract -s test.fastq.gz --start 17 --len 10 --barcode --mode cut --gzip-output --log log-gzip.txt --nproc 4 --outdir output --outname MaskPrimers-gzip --failed
+
+          # Verify at least one output file was created
+          if [ ! -f "output/MaskPrimers-gzip_primers-pass.fastq.gz" ] && [ ! -f "output/MaskPrimers-gzip_primers-fail.fastq.gz" ]; then
+            echo "❌ ERROR: No compressed output files were created!"
+            exit 1
+          fi
+
+      # Compare results between uncompressed and gzipped processing
+      - name: Compare uncompressed vs gzipped results
+        run: |
+          echo "Comparing results between uncompressed and gzipped processing..."
+
+          python3 -c "
+          from Bio import SeqIO
+          from presto.IO import openFile
+
+          # Collect (id, sequence) pairs as sets so the comparison ignores record order
+          with openFile('output/MaskPrimers-extract_primers-pass.fastq', 'r') as handle:
+              uncomp_records = {(rec.id, str(rec.seq)) for rec in SeqIO.parse(handle, 'fastq')}
+
+          with openFile('output/MaskPrimers-gzip_primers-pass.fastq.gz', 'r') as handle:
+              gzip_records = {(rec.id, str(rec.seq)) for rec in SeqIO.parse(handle, 'fastq')}
+
+          print(f'Uncompressed records: {len(uncomp_records)}')
+          print(f'Gzipped records: {len(gzip_records)}')
+
+          if uncomp_records == gzip_records:
+              print('✅ SUCCESS: Files contain identical sequences')
+          else:
+              print('❌ ERROR: Files contain different sequences!')
+              raise SystemExit(1)  # exit() is a site-module helper meant for interactive use
+          "
+
 
diff --git a/bin/AlignSets.py b/bin/AlignSets.py
@@ -25,7 +25,7 @@
 from presto.Annotation import parseAnnotation
 from presto.Applications import runMuscle
 from presto.Sequence import calculateDiversity, indexSeqSets
-from presto.IO import readPrimerFile, getOutputHandle, printLog, printWarning, printError
+from presto.IO import readPrimerFile, getOutputHandle, openFile, printLog, printWarning, printError
 from presto.Multiprocessing import SeqResult, manageProcesses, feedSeqQueue, \
                                    collectSeqQueue
 
@@ -167,14 +167,18 @@ def writeOffsetFile(primer_file, align_func=runMuscle, align_args={},
 
     # Open output handle
     if out_file is not None:
-        out_handle = open(out_file, 'w')
+        # If gzip output was requested, make sure the explicit file name ends in .gz
+        if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+            out_file = out_file + '.gz'
+        out_handle = openFile(out_file, 'w')
     else:
         out_tag = 'reverse' if reverse else 'forward'
         out_handle = getOutputHandle(primer_file,
                                      'offsets-%s' % out_tag,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
-                                     out_type='tab')
+                                     out_type='tab',
+                                     gzip_output=out_args.get('gzip_output', False))
 
     # Write offset table
     for k, v in offset_dict.items():
@@ -427,7 +431,7 @@ def getArgParser():
                                          description='Create a 5\' offset table by primer multiple alignment.')
     group_table = parser_table.add_argument_group('alignment table generation arguments')
     group_table.add_argument('-p', action='store', dest='primer_file', required=True,
-                               help='A FASTA file containing primer sequences.')
+                               help='A FASTA file containing primer sequences. Supports both uncompressed and gzip-compressed (.gz) files.')
     group_table.add_argument('-o', action='store', dest='out_file', default=None,
                              help='''Explicit output file name. Note, this argument cannot be used with
                                   the --failed, --outdir, or --outname arguments. If unspecified, then

diff --git a/bin/ClusterSets.py b/bin/ClusterSets.py
@@ -25,7 +25,7 @@
 from presto.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
 from presto.Annotation import parseAnnotation, flattenAnnotation, mergeAnnotation
 from presto.Applications import runCDHit, runUClust, default_max_memory
-from presto.IO import countSeqFile, getFileType, getOutputHandle, printLog, printMessage, \
+from presto.IO import countSeqFile, getFileType, getOutputHandle, openFile, printLog, printMessage, \
                       printProgress, readSeqFile, printError, printWarning
 from presto.Sequence import indexSeqSets
 from presto.Multiprocessing import SeqResult, manageProcesses, feedSeqQueue, \
@@ -313,13 +313,17 @@ def _header(seq, cluster, field=cluster_field, prefix=cluster_prefix,
 
     # Open output file handles
     if out_file is not None:
-        pass_handle = open(out_file, 'w')
+        # If gzip output was requested, make sure the explicit file name ends in .gz
+        if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+            out_file = out_file + '.gz'
+        pass_handle = openFile(out_file, 'w')
     else:
         pass_handle = getOutputHandle(seq_file,
                                       'cluster-pass',
                                       out_dir=out_args['out_dir'],
                                       out_name=out_args['out_name'],
-                                      out_type=out_args['out_type'])
+                                      out_type=out_args['out_type'],
+                                      gzip_output=out_args.get('gzip_output', False))
 
     # Open indexed sequence file
     seq_dict = readSeqFile(seq_file, index=True)
@@ -443,13 +447,17 @@ def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
 
     # Open output file handles
     if out_file is not None:
-        pass_handle = open(out_file, 'w')
+        # If gzip output was requested, make sure the explicit file name ends in .gz
+        if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+            out_file = out_file + '.gz'
+        pass_handle = openFile(out_file, 'w')
     else:
         pass_handle = getOutputHandle(seq_file,
                                       'cluster-pass',
                                       out_dir=out_args['out_dir'],
                                       out_name=out_args['out_name'],
-                                      out_type=out_args['out_type'])
+                                      out_type=out_args['out_type'],
+                                      gzip_output=out_args.get('gzip_output', False))
 
     # Open indexed sequence file
     seq_dict = readSeqFile(seq_file, index=True)

diff --git a/bin/CollapseSeq.py b/bin/CollapseSeq.py
@@ -22,7 +22,7 @@
 from presto.Annotation import parseAnnotation, flattenAnnotation, mergeAnnotation, \
                               collapseAnnotation
 from presto.Sequence import checkSeqEqual
-from presto.IO import getFileType, readSeqFile, getOutputHandle, printLog, printProgress
+from presto.IO import getFileType, readSeqFile, getOutputHandle, openFile, printLog, printProgress
 
 # Default parameters
 default_max_missing = 0
@@ -243,13 +243,17 @@ def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
 
     # Open unique record output handle
     if out_file is not None:
-        pass_handle = open(out_file, 'w')
+        # If gzip output was requested, make sure the explicit file name ends in .gz
+        if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+            out_file = out_file + '.gz'
+        pass_handle = openFile(out_file, 'w')
     else:
         pass_handle = getOutputHandle(seq_file,
                                       'collapse-unique',
                                       out_dir=out_args['out_dir'],
                                       out_name=out_args['out_name'],
-                                      out_type=out_args['out_type'])
+                                      out_type=out_args['out_type'],
+                                      gzip_output=out_args.get('gzip_output', False))
     # Define log handle
     if out_args['log_file'] is None:
         log_handle = None
@@ -319,15 +323,17 @@ def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
     # Write sequence with high missing character counts
     if out_args['failed'] and not keep_missing:
         with getOutputHandle(seq_file, 'collapse-undetermined', out_dir=out_args['out_dir'],
-                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
+                             out_name=out_args['out_name'], out_type=out_args['out_type'],
+                             gzip_output=out_args.get('gzip_output', False)) \
                 as missing_handle:
             for k in search_keys:
                 SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])
 
     if out_args['failed']:
         # Write duplicate sequences
         with getOutputHandle(seq_file, 'collapse-duplicate', out_dir=out_args['out_dir'],
-                             out_name=out_args['out_name'], out_type=out_args['out_type']) \
+                             out_name=out_args['out_name'], out_type=out_args['out_type'],
+                             gzip_output=out_args.get('gzip_output', False)) \
                 as dup_handle:
             for k in dup_keys:
                 SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

diff --git a/bin/ConvertHeaders.py b/bin/ConvertHeaders.py
@@ -21,7 +21,7 @@
                               convertSRAHeader, convertMIGECHeader
 from presto.Commandline import CommonHelpFormatter, checkArgs, getCommonArgParser, parseCommonArgs
 from presto.IO import getFileType, readSeqFile, countSeqFile, getOutputHandle, \
-                      printLog, printProgress
+                      openFile, printLog, printProgress
 
 
 def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_args=default_out_args):
@@ -61,13 +61,17 @@ def convertHeaders(seq_file, convert_func, convert_args={}, out_file=None, out_a
     # Wrapper for opening handles and writers
     def _open(x, out_file=out_file):
         if out_file is not None and x == 'pass':
-            handle = open(out_file, 'w')
+            # If gzip output was requested, make sure the explicit file name ends in .gz
+            if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+                out_file = out_file + '.gz'
+            handle = openFile(out_file, 'w')
         else:
             handle = getOutputHandle(seq_file,
                                      'convert-%s' % x,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
-                                     out_type=out_args['out_type'])
+                                     out_type=out_args['out_type'],
+                                     gzip_output=out_args.get('gzip_output', False))
         return handle
 
     # Count records

diff --git a/bin/EstimateError.py b/bin/EstimateError.py
@@ -485,7 +485,8 @@ def writeResults(results, seq_file, out_args):
     dist_df[['all']] = dist_df[['all']].astype(int)
 
     # Write to tab delimited files
-    file_args = {'out_dir': out_args['out_dir'], 'out_name': out_args['out_name'], 'out_type': 'tab'}
+    file_args = {'out_dir': out_args['out_dir'], 'out_name': out_args['out_name'], 'out_type': 'tab',
+                 'gzip_output': out_args.get('gzip_output', False)}
     with getOutputHandle(seq_file, 'error-position', **file_args) as pos_handle, \
             getOutputHandle(seq_file, 'error-quality', **file_args) as qual_handle, \
             getOutputHandle(seq_file, 'error-nucleotide', **file_args) as nuc_handle, \
@@ -654,7 +655,8 @@ def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
     thresh_df = pd.DataFrame.from_dict({'thresh': {'ALL': dist_df.index[int(np.mean([index for index in np.argsort(window) \
                                                                                      if dist[index] == np.min(window)]))]}
                                         })
-    file_args = {'out_dir':out_args['out_dir'], 'out_name':out_args['out_name'], 'out_type':'tab'}
+    file_args = {'out_dir':out_args['out_dir'], 'out_name':out_args['out_name'], 'out_type':'tab',
+                 'gzip_output': out_args.get('gzip_output', False)}
 
     # Output as tsv
     with getOutputHandle(seq_file, 'distance-barcode', **file_args) as dist_handle, \
@@ -755,7 +757,7 @@ def getArgParser():
     subparsers.required = True
 
     # Error profiling arguments for sets
-    parent_set = getCommonArgParser(failed=False, seq_out=False, log=True, out_file=False, multiproc=True)
+    parent_set = getCommonArgParser(failed=False, seq_out=True, log=True, out_file=False, multiproc=True)
     parser_set = subparsers.add_parser('set', parents=[parent_set],
                                        formatter_class=CommonHelpFormatter, add_help=False,
                                        help='Estimates error statistics within annotation sets.',
@@ -785,7 +787,7 @@ def getArgParser():
     parser_set.set_defaults(func=estimateSets)
 
     # Error profiling arguments for barcodes
-    parent_barcode = getCommonArgParser(failed=False, seq_out=False, log=False, out_file=False, multiproc=False)
+    parent_barcode = getCommonArgParser(failed=False, seq_out=True, log=False, out_file=False, multiproc=False)
     parser_barcode = subparsers.add_parser('barcode', parents=[parent_barcode],
                                            formatter_class=CommonHelpFormatter, add_help=False,
                                            help='Calculates pairwise distance metrics of barcode sequences.',

diff --git a/bin/MaskPrimers.py b/bin/MaskPrimers.py
@@ -349,7 +349,7 @@ def getArgParser():
                                          description='Find primer matches using pairwise local alignment.')
     group_align = parser_align.add_argument_group('primer alignment arguments')
     group_align.add_argument('-p', action='store', dest='primer_file', required=True,
-                              help='A FASTA file containing primer sequences.')
+                              help='A FASTA file containing primer sequences. Supports both uncompressed and gzip-compressed (.gz) files.')
     group_align.add_argument('--maxerror', action='store', dest='max_error', type=float,
                              default=default_primer_max_error, help='Maximum allowable error rate.')
     group_align.add_argument('--maxlen', action='store', dest='max_len', type=int,
@@ -399,7 +399,7 @@ def getArgParser():
                                          description='Find primer matches by scoring primers at a fixed position.')
     group_score = parser_score.add_argument_group('primer scoring arguments')
     group_score.add_argument('-p', action='store', dest='primer_file', required=True,
-                              help='A FASTA file containing primer sequences.')
+                              help='A FASTA file containing primer sequences. Supports both uncompressed and gzip-compressed (.gz) files.')
     group_score.add_argument('--start', action='store', dest='start', type=int, default=default_primer_start,
                              help='The starting position of the primer.')
     group_score.add_argument('--maxerror', action='store', dest='max_error', type=float,

diff --git a/bin/PairSeq.py b/bin/PairSeq.py
@@ -83,15 +83,19 @@ def _key_func(x):
 
     # Open output file handles
     pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'],
-                                    out_name=out_name_1, out_type=out_type_1)
+                                    out_name=out_name_1, out_type=out_type_1, 
+                                    gzip_output=out_args.get('gzip_output', False))
     pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'],
-                                    out_name=out_name_2, out_type=out_type_2)
+                                    out_name=out_name_2, out_type=out_type_2,
+                                    gzip_output=out_args.get('gzip_output', False))
 
     if out_args['failed']:
         fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'],
-                                        out_name=out_name_1, out_type=out_type_1)
+                                        out_name=out_name_1, out_type=out_type_1,
+                                        gzip_output=out_args.get('gzip_output', False))
         fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'],
-                                        out_name=out_name_2, out_type=out_type_2)
+                                        out_name=out_name_2, out_type=out_type_2,
+                                        gzip_output=out_args.get('gzip_output', False))
         pass_keys = list()
 
     # Iterate over pairs and write to output files

diff --git a/bin/ParseHeaders.py b/bin/ParseHeaders.py
@@ -22,7 +22,7 @@
                               addHeader, collapseHeader, copyHeader, deleteHeader, \
                               expandHeader, mergeHeader, renameHeader
 from presto.IO import getFileType, readSeqFile, countSeqFile, getOutputHandle, \
-                      printLog, printProgress
+                      openFile, printLog, printProgress
 
 
 def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
@@ -62,13 +62,17 @@ def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=de
     seq_iter = readSeqFile(seq_file)
     if out_args['out_type'] is None:  out_args['out_type'] = in_type
     if out_file is not None:
-        out_handle = open(out_file, 'w')
+        # If gzip output was requested, make sure the explicit file name ends in .gz
+        if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+            out_file = out_file + '.gz'
+        out_handle = openFile(out_file, 'w')
     else:
         out_handle = getOutputHandle(seq_file,
                                      'reheader',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
-                                     out_type=out_args['out_type'])
+                                     out_type=out_args['out_type'],
+                                     gzip_output=out_args.get('gzip_output', False))
     # Count records
     result_count = countSeqFile(seq_file)
 
@@ -127,13 +131,17 @@ def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args):
     # Open file handles
     seq_iter = readSeqFile(seq_file)
     if out_file is not None:
-        out_handle = open(out_file, 'w')
+        # If gzip output was requested, make sure the explicit file name ends in .gz
+        if out_args.get('gzip_output', False) and not out_file.endswith('.gz'):
+            out_file = out_file + '.gz'
+        out_handle = openFile(out_file, 'w')
     else:
         out_handle = getOutputHandle(seq_file,
                                      'headers',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
-                                     out_type='tab')
+                                     out_type='tab',
+                                     gzip_output=out_args.get('gzip_output', False))
     # Count records
     result_count = countSeqFile(seq_file)