diff --git a/func_tests/CMakeLists.txt b/func_tests/CMakeLists.txt index 0d6ac50..483f852 100644 --- a/func_tests/CMakeLists.txt +++ b/func_tests/CMakeLists.txt @@ -105,3 +105,27 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/smR2.fastq.gz file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/config_sm.txt DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/config.tsv + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/config_.tsv + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/config__.tsv + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/config___.tsv + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/input.fastq + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/input_.fastq + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/input__.fastq + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/input___.fastq + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + diff --git a/func_tests/config.tsv b/func_tests/config.tsv new file mode 100644 index 0000000..6b30873 --- /dev/null +++ b/func_tests/config.tsv @@ -0,0 +1,10 @@ +groups ids tags distances locations previous +NYStgBot Y18 TCTCCTTACG 0:0:0 0:0:11 - +NYStgBot Y32 TGTAGTTCTA 0:1:1 0:0:11 - +OddBot3 O45 GCCTAGTAGAAGACGTT 2:2:2 0:11:0 {{NYStgBot}}4-10 +EvenBot2 E41 ATAGATTGTTGCGTGCT 2:2:2 0:11:0 {{OddBot3}}4-10 +OddBot2 O21 GGATAGCACCGTTCATT 1:1:1 0:11:0 {{EvenBot2}}4-10 +EvenBot1 E2 TGTAGGTTCTGGAATAT 0:0:0 0:11:0 {{OddBot2}}4-10 +OddBot1 O85 GCTGTGTCTGTCACCT 1:1:1 0:11:0 {{EvenBot1}}4-10 +DPM_const DPM_const TCATGTCTTCCGATCT 2:0:2 0:11:0 {{OddBot1}}4-10 +DPM_R2 DPM1 TGGGTGTTT 1:0:1 0:11:0 {DPM_const}0-0 diff --git a/func_tests/config_.tsv b/func_tests/config_.tsv new file mode 100644 index 0000000..6fd3784 --- /dev/null +++ b/func_tests/config_.tsv @@ -0,0 +1,4 @@ +tags locations +TGTAGG 0:0:0 +TGTAGGTT 0:0:0 +TGGG 0:0:0 diff --git a/func_tests/config__.tsv b/func_tests/config__.tsv new file mode 100644 index 0000000..074c350 --- /dev/null +++ b/func_tests/config__.tsv @@ -0,0 +1,15 @@ +ids tags distance +tagT TTT 0 +tagT TTTT 0 +tagT TTTTT 0 +tagT TTTTTT 0 +tagT TTTTTTT 0 +tagT TTTTTTTT 0 +tagT TTTTTTTTT 0 +tagT TTTTTTTTTT 1 +tagT TTTTTTTTTTT 1 +tagT TTTTTTTTTTTT 1 +tagT TTTTTTTTTTTTT 1 +tagT TTTTTTTTTTTTTT 1 +tagT TTTTTTTTTTTTTTT 1 + diff --git a/func_tests/config___.tsv b/func_tests/config___.tsv new file mode 100644 index 0000000..f9cd592 --- /dev/null +++ b/func_tests/config___.tsv @@ -0,0 +1,5 @@ +ids tags distance partial5 +tagT TTTTTTTTTT 0 3:0.2 +tagA AAAAAAAAAA 1 3:0.2 +tagG GGGGGGGGGG 1 - + diff --git a/func_tests/input.fastq b/func_tests/input.fastq new file mode 100644 index 0000000..d0de569 --- /dev/null +++ b/func_tests/input.fastq @@ -0,0 +1,4 @@ +@r1 +TCTCCTTACGGACAACTGCCTAGTAGAAGACGTTTGACTTGATAGATTGTTGCGTGCTCACAACTGGATAGCACCGTTCATTTGACTTGTGTAGGTTCTGGAATATGACAACTGCTGTGTCTGTCACCTTTGACTTGTCA ++ +TCTCCTTACGGACAACTGCCTAGTAGAAGACGTTTGACTTGATAGATTGTTGCGTGCTCACAACTGGATAGCACCGTTCATTTGACTTGTGTAGGTTCTGGAATATGACAACTGCTGTGTCTGTCACCTTTGACTTGTCA diff --git a/func_tests/input_.fastq b/func_tests/input_.fastq new file mode 100644 index 0000000..4dd1b60 --- /dev/null +++ b/func_tests/input_.fastq @@ -0,0 +1,4 @@ +@r +NNNNNNNNNNTGACTTGTGTAGGTTCTGGAAT ++ +NNNNNNNNNNTGACTTGTGTAGGTTCTGGAAT diff --git a/func_tests/input__.fastq b/func_tests/input__.fastq new file mode 100644 index 0000000..1221ef1 --- /dev/null +++ b/func_tests/input__.fastq @@ -0,0 +1,21 @@ +@11_bp_no_mismatch +AAATTTTTTTTTTTAAA ++ +;;;;;;;;;;;;;;;;; +@11_bp_1_internal_mismatch +AAATTTTTTTTTCTAAA ++ +;;;;;;;;;;;;;;;;; +@11_bp_1_internal_mismatch2 +AAATTTTTTTTCTTAAA ++ +;;;;;;;;;;;;;;;;; +@11_bp_1_5prime_mismatch +AAACTTTTTTTTTTAAA ++ +;;;;;;;;;;;;;;;;; +@11_bp_1_3prime_mismatch +AAATTTTTTTTTTCAAA ++ +;;;;;;;;;;;;;;;;; + diff --git a/func_tests/input___.fastq b/func_tests/input___.fastq new file mode 100644 index 0000000..34edf7c --- /dev/null +++ b/func_tests/input___.fastq @@ -0,0 +1,41 @@ +@read1 5prime tagT +TTTTTTTTTT ++ +;;;;;;;;;; +@read2 5prime tagT with 3prime substitution +TTTTTTTTTC ++ +;;;;;;;;;; +@read3 5prime tagT with interior substitution +TTTCTTTTTT ++ +;;;;;;;;;; +@read4 5prime tagT with interior substitution within first 3 bp +TTCTTTTTTT ++ +;;;;;;;;;; +@read5 5prime tagT with interior substitution within first 3 bp +TCTTTTTTTT ++ +;;;;;;;;;; +@read6 5prime tagT with 5prime substitution +CTTTTTTTTT ++ +;;;;;;;;;; +@read7 5prime tagT with 5prime and 3prime substitutions +CTTTTTTTTC ++ +;;;;;;;;;; +@read8 5prime tagT with 5prime and interior substitutions +CTTTTTTCTT ++ +;;;;;;;;;; +@read9 5prime tagA with 5prime and interior substitutions; like read3 +AAACAAAAAA ++ +;;;;;;;;;; +@read10 5prime tagG with 5prime and interior substitutions; like read3 +GGGCGGGGGG ++ +;;;;;;;;;; + diff --git a/func_tests/runtests.sh b/func_tests/runtests.sh index 8e676d7..7affa4d 100644 --- a/func_tests/runtests.sh +++ b/func_tests/runtests.sh @@ -94,16 +94,6 @@ checkcmdoutput "cat $test_dir/mx.txt" b95e8b332c8a0a7ffc0f91118e754302 checkcmdoutput "$splitcode --assign --x-only --nFastqs=2 --empty N -x \"0:00:-1,{adapter},{adapter},2:0\" --gzip --mod-names --bclen=20 -t 1 -c $test_dir/config_sm.txt --mapping=/dev/null --pipe $test_dir/smR1.fastq.gz $test_dir/smR2.fastq.gz" 8fc440842f4e2922976cdc1b165008b1 -# Test lift workflow - -checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_1.vcf.gz CAST_EiJ --kmer-length=31 --kmer-output=$test_dir/test.kmers.1.txt" 7e9c1d67efdf7113bfab367cb5d2d640 -checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --rename --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" c9a91833da19b20383d6bd1d3e32ff8f -checkcmdoutput "cat $test_dir/test.kmers.1.txt" 54d189f4549f6b35ea80ec5c167332b7 -checkcmdoutput "cat $test_dir/test.kmers.2.txt" 9982de087ed358724580836600cc9ba7 -checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --diploid --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" 6c33bd3ba7cd9aaefeca5cbaa272cdfe -checkcmdoutput "cat $test_dir/test.kmers.2.txt" a4c73b67a0ad6e5094ff7f2dfdda15bc -checkcmdoutput "$splitcode --lift --snv-only $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ" 4bdcaf9f34da45033d477651d8845bf2 -checkcmdoutput "$splitcode --lift --kmer-sj $test_dir/vcf_validation.fa.gz $test_dir/example.SJ.tab --kmer-length=31 --kmer-header=X_ --kmer-header-num" 1e88d0b72323a6c11faa73e19007fc5f # Test from-name, random, and revcomp @@ -239,14 +229,21 @@ checkcmdoutput "$splitcode --trim-only --pipe -q 10 --qtrim-3 -s $test_dir/test_ cmdexec "wc $test_dir/test_summary.txt" checkcmdoutput "$splitcode --trim-only --pipe -q 10 --qtrim-3 --qtrim-pre -5 5 -E ATCG $test_dir/test.fq" 1b5a09bd343382ee78c9aa51245557c2 +# Some fixes (Dec. 2025) + +checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input.fastq" a4508f24c69838b07cec04978def09ca +checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config_.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input_.fastq" 200fb9106e47c741b1891e4b2fedef05 +checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config__.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input__.fastq" 4caa199550f26000ac787821cc62668c +checkcmdoutput "$splitcode --trim-only --pipe -c $test_dir/config___.tsv --loc-names --out-fasta --nFastqs 1 $test_dir/input___.fastq" 209698dea910ed4d4cd05dbce0889800 + # Adapter trimming tests checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=3:0.35 --left=1 --pipe $test_dir/test.fq" b637fbabe71eb90bb9b3399a17eabef7 checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=3:0.34 --left=1 --pipe $test_dir/test.fq" b637fbabe71eb90bb9b3399a17eabef7 checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=3:0.33 --left=1 --pipe $test_dir/test.fq" cb52b79ed7469ca2ffe5739ec544b157 checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=4:0.34 --left=1 --pipe $test_dir/test.fq" cb52b79ed7469ca2ffe5739ec544b157 -checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=2:0.34 --left=1 --pipe $test_dir/test.fq" c6eba12c36e53301f23a9823c2901f24 -checkcmdoutput "$splitcode --trim-only -b CCAAA,CCGGAA --partial5=2:0.34, --partial3=,4 --left=1,0 --right=0,1 --pipe $test_dir/test.fq" 5d4541fb96da328d07ab9189216cf4a5 +checkcmdoutput "$splitcode --trim-only -b CCAAA --partial5=2:0.34 --left=1 --pipe $test_dir/test.fq" b637fbabe71eb90bb9b3399a17eabef7 +checkcmdoutput "$splitcode --trim-only -b CCAAA,CCGGAA --partial5=2:0.34, --partial3=,4 --left=1,0 --right=0,1 --pipe $test_dir/test.fq" 93f1726415edb410d5e733603bc4be11 checkcmdoutput "$splitcode --trim-only -b CCGC -l 0:-4:0 --partial3=4:0.25 --right=1 --pipe $test_dir/test.fq" 11b55a195b5976331305569416db5bd4 checkcmdoutput "$splitcode --trim-only -b CCGG,CCGG -i a,b -l 0:-4:9,1:-4:10 --partial3=2,2 --right=1,1 -N 2 --pipe $test_dir/test.fq $test_dir/test.fq" 6807e3ba911fde8fb437f693d055c11f checkcmdoutput "$splitcode --trim-only -b CCGC,GAAG -a ,{CCGC} -v {GAAG}, -l 0:-4:0, --partial3=4:0.25,3 --partial5=4,3 --right=1,0 --left=0,1 --pipe $test_dir/test.fq" 93f1726415edb410d5e733603bc4be11 @@ -553,4 +550,14 @@ TATTATGGTCCCCCCCCTTCGTGGAATCTAGCTGACTTGTGACTAGCTDGGGGGGGGGG" > $test_dir/test_te checkcmdoutput "$splitcode --assign -m /dev/null --mod-names -g PART,RPM,RPM,ODD,Y -i Part,RTBC2,RTBC4,Odd2Bo1,NYBot1_Stg -b TGACTTG,TTTTTTT,GGGGGGG,TTCGTGGAATCTAGC,TATTATGGT --maxFindsG=Y:1 -p $test_dir/test_term.fq" 79ad1ca9184b274e1396b5f2220cbade +# Test lift workflow + +checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_1.vcf.gz CAST_EiJ --kmer-length=31 --kmer-output=$test_dir/test.kmers.1.txt" 7e9c1d67efdf7113bfab367cb5d2d640 +checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --rename --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" c9a91833da19b20383d6bd1d3e32ff8f +checkcmdoutput "cat $test_dir/test.kmers.1.txt" 54d189f4549f6b35ea80ec5c167332b7 +checkcmdoutput "cat $test_dir/test.kmers.2.txt" 9982de087ed358724580836600cc9ba7 +checkcmdoutput "$splitcode --lift $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ --diploid --kmer-length=31 --kmer-output=$test_dir/test.kmers.2.txt" 6c33bd3ba7cd9aaefeca5cbaa272cdfe +checkcmdoutput "cat $test_dir/test.kmers.2.txt" a4c73b67a0ad6e5094ff7f2dfdda15bc +checkcmdoutput "$splitcode --lift --snv-only $test_dir/vcf_validation.fa.gz $test_dir/test_2.vcf.gz CAST_EiJ" 4bdcaf9f34da45033d477651d8845bf2 +checkcmdoutput "$splitcode --lift --kmer-sj $test_dir/vcf_validation.fa.gz $test_dir/example.SJ.tab --kmer-length=31 --kmer-header=X_ --kmer-header-num" 1e88d0b72323a6c11faa73e19007fc5f diff --git a/src/SplitCode.h b/src/SplitCode.h index 2b4439b..0c2cc25 100644 --- a/src/SplitCode.h +++ b/src/SplitCode.h @@ -1,7 +1,7 @@ #ifndef SPLITCODE_H #define SPLITCODE_H -#define SPLITCODE_VERSION "0.31.4" +#define SPLITCODE_VERSION "0.31.5" #include #include @@ -2257,7 +2257,7 @@ struct SplitCode { if (updated_name_id != name_id_curr) { return false; // multiple tags of different names } - if (updated_error >= error_prev) { // Choose smallest error first when deciding if to update to larger k + if (true /* updated_error >= error_prev */) { // Edit: Always update to larger k if tags have same name (regardless of mismatch error) updated_tag_id = tag_id_curr; updated_k = curr_k; // Update to larger k updated_error = error_prev; @@ -4979,7 +4979,7 @@ struct SplitCode { std::vector placement_vec; std::string from_header_str; - std::vector> k_expansions; // Keeps track of all possible substring/k-mer lengths for each file (file number is the index) + std::vector> k_expansions; // Keeps track of all possible substring/k-mer lengths for each file (file number is the index) bool init; bool discard_check;