Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions func_tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,27 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/smR2.fastq.gz
# Copy the functional-test fixtures (config_sm.txt plus the config*.tsv and
# input*.fastq files) from this source directory into the matching build
# directory. file(COPY) runs when CMake configures the project.
file(COPY
  ${CMAKE_CURRENT_SOURCE_DIR}/config_sm.txt
  ${CMAKE_CURRENT_SOURCE_DIR}/config.tsv
  ${CMAKE_CURRENT_SOURCE_DIR}/config_.tsv
  ${CMAKE_CURRENT_SOURCE_DIR}/config__.tsv
  ${CMAKE_CURRENT_SOURCE_DIR}/config___.tsv
  ${CMAKE_CURRENT_SOURCE_DIR}/input.fastq
  ${CMAKE_CURRENT_SOURCE_DIR}/input_.fastq
  ${CMAKE_CURRENT_SOURCE_DIR}/input__.fastq
  ${CMAKE_CURRENT_SOURCE_DIR}/input___.fastq
  DESTINATION ${CMAKE_CURRENT_BINARY_DIR})

10 changes: 10 additions & 0 deletions func_tests/config.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
groups ids tags distances locations previous
NYStgBot Y18 TCTCCTTACG 0:0:0 0:0:11 -
NYStgBot Y32 TGTAGTTCTA 0:1:1 0:0:11 -
OddBot3 O45 GCCTAGTAGAAGACGTT 2:2:2 0:11:0 {{NYStgBot}}4-10
EvenBot2 E41 ATAGATTGTTGCGTGCT 2:2:2 0:11:0 {{OddBot3}}4-10
OddBot2 O21 GGATAGCACCGTTCATT 1:1:1 0:11:0 {{EvenBot2}}4-10
EvenBot1 E2 TGTAGGTTCTGGAATAT 0:0:0 0:11:0 {{OddBot2}}4-10
OddBot1 O85 GCTGTGTCTGTCACCT 1:1:1 0:11:0 {{EvenBot1}}4-10
DPM_const DPM_const TCATGTCTTCCGATCT 2:0:2 0:11:0 {{OddBot1}}4-10
DPM_R2 DPM1 TGGGTGTTT 1:0:1 0:11:0 {DPM_const}0-0
4 changes: 4 additions & 0 deletions func_tests/config_.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
tags locations
TGTAGG 0:0:0
TGTAGGTT 0:0:0
TGGG 0:0:0
15 changes: 15 additions & 0 deletions func_tests/config__.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
ids tags distance
tagT TTT 0
tagT TTTT 0
tagT TTTTT 0
tagT TTTTTT 0
tagT TTTTTTT 0
tagT TTTTTTTT 0
tagT TTTTTTTTT 0
tagT TTTTTTTTTT 1
tagT TTTTTTTTTTT 1
tagT TTTTTTTTTTTT 1
tagT TTTTTTTTTTTTT 1
tagT TTTTTTTTTTTTTT 1
tagT TTTTTTTTTTTTTTT 1

5 changes: 5 additions & 0 deletions func_tests/config___.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ids tags distance partial5
tagT TTTTTTTTTT 0 3:0.2
tagA AAAAAAAAAA 1 3:0.2
tagG GGGGGGGGGG 1 -

4 changes: 4 additions & 0 deletions func_tests/input.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@r1
TCTCCTTACGGACAACTGCCTAGTAGAAGACGTTTGACTTGATAGATTGTTGCGTGCTCACAACTGGATAGCACCGTTCATTTGACTTGTGTAGGTTCTGGAATATGACAACTGCTGTGTCTGTCACCTTTGACTTGTCA
+
TCTCCTTACGGACAACTGCCTAGTAGAAGACGTTTGACTTGATAGATTGTTGCGTGCTCACAACTGGATAGCACCGTTCATTTGACTTGTGTAGGTTCTGGAATATGACAACTGCTGTGTCTGTCACCTTTGACTTGTCA
4 changes: 4 additions & 0 deletions func_tests/input_.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@r
NNNNNNNNNNTGACTTGTGTAGGTTCTGGAAT
+
NNNNNNNNNNTGACTTGTGTAGGTTCTGGAAT
21 changes: 21 additions & 0 deletions func_tests/input__.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
@11_bp_no_mismatch
AAATTTTTTTTTTTAAA
+
;;;;;;;;;;;;;;;;;
@11_bp_1_internal_mismatch
AAATTTTTTTTTCTAAA
+
;;;;;;;;;;;;;;;;;
@11_bp_1_internal_mismatch2
AAATTTTTTTTCTTAAA
+
;;;;;;;;;;;;;;;;;
@11_bp_1_5prime_mismatch
AAACTTTTTTTTTTAAA
+
;;;;;;;;;;;;;;;;;
@11_bp_1_3prime_mismatch
AAATTTTTTTTTTCAAA
+
;;;;;;;;;;;;;;;;;

41 changes: 41 additions & 0 deletions func_tests/input___.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
@read1 5prime tagT
TTTTTTTTTT
+
;;;;;;;;;;
@read2 5prime tagT with 3prime substitution
TTTTTTTTTC
+
;;;;;;;;;;
@read3 5prime tagT with interior substitution
TTTCTTTTTT
+
;;;;;;;;;;
@read4 5prime tagT with interior substitution within first 3 bp
TTCTTTTTTT
+
;;;;;;;;;;
@read5 5prime tagT with interior substitution within first 3 bp
TCTTTTTTTT
+
;;;;;;;;;;
@read6 5prime tagT with 5prime substitution
CTTTTTTTTT
+
;;;;;;;;;;
@read7 5prime tagT with 5prime and 3prime substitutions
CTTTTTTTTC
+
;;;;;;;;;;
@read8 5prime tagT with 5prime and interior substitutions
CTTTTTTCTT
+
;;;;;;;;;;
@read9 5prime tagA with 5prime and interior substitutions; like read3
AAACAAAAAA
+
;;;;;;;;;;
@read10 5prime tagG with 5prime and interior substitutions; like read3
GGGCGGGGGG
+
;;;;;;;;;;

31 changes: 19 additions & 12 deletions func_tests/runtests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,6 @@ checkcmdoutput "cat $test_dir/mx.txt" b95e8b332c8a0a7ffc0f91118e754302

checkcmdoutput "$splitcode --assign --x-only --nFastqs=2 --empty N -x \"0:0<R1>0:-1,<R1[10]>{adapter},{adapter}<R1[1-65]>,2:0<R2[1-65]>\" --gzip --mod-names --bclen=20 -t 1 -c $test_dir/config_sm.txt --mapping=/dev/null --pipe $test_dir/smR1.fastq.gz $test_dir/smR2.fastq.gz" 8fc440842f4e2922976cdc1b165008b1

# Test lift workflow

checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_1.vcf.gz CAST_EiJ --kmer-length=31 --kmer-output=$test_dir/test.kmers.1.txt" 7e9c1d67efdf7113bfab367cb5d2d640
checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --rename --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" c9a91833da19b20383d6bd1d3e32ff8f
checkcmdoutput "cat $test_dir/test.kmers.1.txt" 54d189f4549f6b35ea80ec5c167332b7
checkcmdoutput "cat $test_dir/test.kmers.2.txt" 9982de087ed358724580836600cc9ba7
checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --diploid --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" 6c33bd3ba7cd9aaefeca5cbaa272cdfe
checkcmdoutput "cat $test_dir/test.kmers.2.txt" a4c73b67a0ad6e5094ff7f2dfdda15bc
checkcmdoutput "$splitcode --lift --snv-only $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ" 4bdcaf9f34da45033d477651d8845bf2
checkcmdoutput "$splitcode --lift --kmer-sj $test_dir/vcf_validation.fa.gz $test_dir/example.SJ.tab --kmer-length=31 --kmer-header=X_ --kmer-header-num" 1e88d0b72323a6c11faa73e19007fc5f


# Test from-name, random, and revcomp
Expand Down Expand Up @@ -239,14 +229,21 @@ checkcmdoutput "$splitcode --trim-only --pipe -q 10 --qtrim-3 -s $test_dir/test_
cmdexec "wc $test_dir/test_summary.txt"
checkcmdoutput "$splitcode --trim-only --pipe -q 10 --qtrim-3 --qtrim-pre -5 5 -E ATCG $test_dir/test.fq" 1b5a09bd343382ee78c9aa51245557c2

# Some fixes (Dec. 2025)
# Each check below runs splitcode with the same flags (--trim-only --pipe
# --loc-names --out-fasta --nFastqs 1) on one config*.tsv / input*.fastq
# fixture pair. The second argument to checkcmdoutput is a 32-character hex
# digest -- presumably the expected md5 of the command's output; confirm
# against the checkcmdoutput definition earlier in this script.

# config.tsv: each group's `previous` column references the group listed before
# it; input.fastq holds a single read carrying the Y18, O45, E41, O21, E2 and
# O85 tag sequences in that order.
checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input.fastq" a4508f24c69838b07cec04978def09ca
# config_.tsv: tags only (no ids); TGTAGG is a prefix of TGTAGGTT, and the
# single read in input_.fastq contains TGTAGGTT.
checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config_.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input_.fastq" 200fb9106e47c741b1891e4b2fedef05
# config__.tsv: thirteen poly-T tags (3 to 15 bp) that all share the id tagT,
# with distance 0 up to 9 bp and distance 1 from 10 bp; the reads in
# input__.fastq are 11-bp T runs flanked by AAA with zero or one substitution.
checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config__.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input__.fastq" 4caa199550f26000ac787821cc62668c
# config___.tsv: three 10-bp homopolymer tags, two of them with a partial5
# value of 3:0.2; the reads in input___.fastq are 10 bp long with substitutions
# at the 5' end, the 3' end and interior positions.
checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config___.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input___.fastq" 209698dea910ed4d4cd05dbce0889800

# Adapter trimming tests

checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=3:0.35 --left=1 --pipe $test_dir/test.fq" b637fbabe71eb90bb9b3399a17eabef7
checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=3:0.34 --left=1 --pipe $test_dir/test.fq" b637fbabe71eb90bb9b3399a17eabef7
checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=3:0.33 --left=1 --pipe $test_dir/test.fq" cb52b79ed7469ca2ffe5739ec544b157
checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=4:0.34 --left=1 --pipe $test_dir/test.fq" cb52b79ed7469ca2ffe5739ec544b157
checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=2:0.34 --left=1 --pipe $test_dir/test.fq" c6eba12c36e53301f23a9823c2901f24
checkcmdoutput "$splitcode --trim-only -b CCAAA,CCGGAA --partial5=2:0.34, --partial3=,4 --left=1,0 --right=0,1 --pipe $test_dir/test.fq" 5d4541fb96da328d07ab9189216cf4a5
checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=2:0.34 --left=1 --pipe $test_dir/test.fq" b637fbabe71eb90bb9b3399a17eabef7
checkcmdoutput "$splitcode --trim-only -b CCAAA,CCGGAA --partial5=2:0.34, --partial3=,4 --left=1,0 --right=0,1 --pipe $test_dir/test.fq" 93f1726415edb410d5e733603bc4be11
checkcmdoutput "$splitcode --trim-only -b CCGC -l 0:-4:0 --partial3=4:0.25 --right=1 --pipe $test_dir/test.fq" 11b55a195b5976331305569416db5bd4
checkcmdoutput "$splitcode --trim-only -b CCGG,CCGG -i a,b -l 0:-4:9,1:-4:10 --partial3=2,2 --right=1,1 -N 2 --pipe $test_dir/test.fq $test_dir/test.fq" 6807e3ba911fde8fb437f693d055c11f
checkcmdoutput "$splitcode --trim-only -b CCGC,GAAG -a ,{CCGC} -v {GAAG}, -l 0:-4:0, --partial3=4:0.25,3 --partial5=4,3 --right=1,0 --left=0,1 --pipe $test_dir/test.fq" 93f1726415edb410d5e733603bc4be11
Expand Down Expand Up @@ -553,4 +550,14 @@ TATTATGGTCCCCCCCCTTCGTGGAATCTAGCTGACTTGTGACTAGCTDGGGGGGGGGG" > $test_dir/test_te

checkcmdoutput "$splitcode --assign -m /dev/null --mod-names -g PART,RPM,RPM,ODD,Y -i Part,RTBC2,RTBC4,Odd2Bo1,NYBot1_Stg -b TGACTTG,TTTTTTT,GGGGGGG,TTCGTGGAATCTAGC,TATTATGGT --maxFindsG=Y:1 -p $test_dir/test_term.fq" 79ad1ca9184b274e1396b5f2220cbade

# Test lift workflow
# NOTE: these checks are order-dependent. The first two --lift runs name
# test.kmers.1.txt and test.kmers.2.txt as their --kmer-output, and the two
# `cat` checks that follow verify those files. The --diploid run then reuses
# test.kmers.2.txt as its --kmer-output, and that file is checked again
# against a different digest.

checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_1.vcf.gz CAST_EiJ --kmer-length=31 --kmer-output=$test_dir/test.kmers.1.txt" 7e9c1d67efdf7113bfab367cb5d2d640
checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --rename --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" c9a91833da19b20383d6bd1d3e32ff8f
checkcmdoutput "cat $test_dir/test.kmers.1.txt" 54d189f4549f6b35ea80ec5c167332b7
checkcmdoutput "cat $test_dir/test.kmers.2.txt" 9982de087ed358724580836600cc9ba7
checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --diploid --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" 6c33bd3ba7cd9aaefeca5cbaa272cdfe
checkcmdoutput "cat $test_dir/test.kmers.2.txt" a4c73b67a0ad6e5094ff7f2dfdda15bc
# The remaining two runs pass no --kmer-output; only the digest of the
# command's own output is checked.
checkcmdoutput "$splitcode --lift --snv-only $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ" 4bdcaf9f34da45033d477651d8845bf2
checkcmdoutput "$splitcode --lift --kmer-sj $test_dir/vcf_validation.fa.gz $test_dir/example.SJ.tab --kmer-length=31 --kmer-header=X_ --kmer-header-num" 1e88d0b72323a6c11faa73e19007fc5f

6 changes: 3 additions & 3 deletions src/SplitCode.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef SPLITCODE_H
#define SPLITCODE_H

#define SPLITCODE_VERSION "0.31.4"
#define SPLITCODE_VERSION "0.31.5"

#include <string>
#include <iostream>
Expand Down Expand Up @@ -2257,7 +2257,7 @@ struct SplitCode {
if (updated_name_id != name_id_curr) {
return false; // multiple tags of different names
}
if (updated_error >= error_prev) { // Choose smallest error first when deciding if to update to larger k
if (true /* updated_error >= error_prev */) { // Edit: Always update to larger k if tags have same name (regardless of mismatch error)
updated_tag_id = tag_id_curr;
updated_k = curr_k; // Update to larger k
updated_error = error_prev;
Expand Down Expand Up @@ -4979,7 +4979,7 @@ struct SplitCode {
std::vector<placement_struct> placement_vec;
std::string from_header_str;

std::vector<std::unordered_set<size_t>> k_expansions; // Keeps track of all possible substring/k-mer lengths for each file (file number is the index)
std::vector<std::set<size_t>> k_expansions; // Keeps track of all possible substring/k-mer lengths for each file (file number is the index)

bool init;
bool discard_check;
Expand Down