18 from typing
import List, Tuple
19 from pathlib
import Path
24 from ...utils
import command, logProcessOutput, CommandException
25 from ...entities
import CustomDataset
26 from ...logging
import LogSeverity
29 def indexCommand(bwaPath: Path, sequencePath: Path, prefix: Path) ->
None:
31 This function acts as a wrapper for the index command of BWA
32 (Burrows-Wheeler Aligner). It is necessary to perform indexing
33 on the reference sequence before alignment.
38 Path pointing to the bwa binary
40 Path pointing to the reference sequence
42 Path serving as the prefix for indexing. This command will
43 generate 5 new files that are needed for alignment. These files will
44 have the path "{prefix}.{suffix}", each with a different suffix / extension
48 >>> from pathlib import Path
49 >>> bwaPath = Path("tools/bwa")
50 >>> sequencePath = Path("sequences/hg18.fa")
51 >>> prefix = Path("output/hg18")
52 >>> indexCommand(bwaPath, sequencePath, prefix)
54 Link to BWA: https://bio-bwa.sourceforge.net
58 str(bwaPath.absolute()),
"index",
59 "-p", str(prefix.absolute()),
60 str(sequencePath.absolute())
69 multithreading: bool =
True
73 This function acts as a wrapper for the mem command of BWA
74 (Burrows-Wheeler Aligner). It perfoms alignment of a given sequence read
75 to the reference sequence and outputs a SAM file.
80 Path pointing to the bwa binary
82 Path that serves as the prefix for the 5 files generated during indexing
84 Path pointing to the sequence read file on which alignment should be performed
86 Path of the output file
90 >>> from pathlib import Path
91 >>> bwaPath = Path("tools/bwa")
92 >>> prefix = Path("reference/hg18")
93 >>> sequencePath = Path("input/R34D.fastq")
94 >>> outputPath = Path("output/R34D.sam")
95 >>> alignCommand(bwaPath, prefix, sequencePath, outputPath)
97 Link to BWA: https://bio-bwa.sourceforge.net
101 str(bwaPath.absolute()),
"mem",
102 "-o", str(outputPath.absolute())
106 threads = os.cpu_count()
107 if threads
is not None:
108 args.extend([
"-t", str(threads)])
111 str(prefix.absolute()),
112 str(sequencePath.absolute())
122 multithreading: bool =
True
126 This function uses the CLI tool "samtools" to convert SAM files into their binary
132 Path pointing to the samtools binary
134 Path pointing to the input .sam file
136 Path pointing to the output, .bam, file
140 >>> from pathlib import Path
141 >>> samtoolsPath = Path("tools/samtools")
142 >>> samPath = Path("input/R34D.sam")
143 >>> outputPath = Path("output/R34D.bam")
144 >>> sam2bamCommand(samtoolsPath, samPath, outputPath)
146 Link to samtools: http://htslib.org/
150 str(samtoolsPath.absolute()),
"view",
155 threads = os.cpu_count()
156 if threads
is not None:
157 args.extend([
"--threads", str(threads - 1)])
160 "-o", str(outputPath.absolute()),
161 str(samPath.absolute())
167 def extractData(samtoolsPath: Path, file: Path) -> Tuple[List[int], List[int], List[int]]:
169 Takes an aligned sequence file (SAM/BAM) and returns three lists holding
170 MAPQ scores, leftmost position and sequence length for each read in the
171 file. This is done using the samtools view command.
176 Path pointing to the samtools binary
178 Path pointing to the input .sam or .bam file
182 >>> from pathlib import Path
183 >>> samtoolsPath = Path("tools/samtools")
184 >>> file = Path("R34D.bam")
185 >>> scores, positions, lengths = extractData(samtoolsPath, file)
187 Link to samtools: http://htslib.org/
190 scores: List[int] = []
191 positions: List[int] = []
192 sequenceLengths: List[int] = []
195 str(samtoolsPath.absolute()),
197 "-F",
"256",
"-F",
"2048",
"-F",
"512",
201 process = subprocess.Popen(
204 cwd = Path(__file__).parent,
205 stdout = subprocess.PIPE,
206 stderr = subprocess.PIPE
209 while process.poll()
is None:
210 if process.stdout
is None:
213 stdout = process.stdout.readlines()
217 fields = line.decode(
"UTF-8").strip().split(
"\t")
218 scores.append(int(fields[4]))
219 positions.append(int(fields[3]))
220 sequenceLengths.append(len(fields[9]))
222 if process.stderr
is not None:
223 for stderr
in process.stderr:
225 if process.returncode == 0:
226 logProcessOutput(stderr, LogSeverity.warning)
228 logProcessOutput(stderr, LogSeverity.fatal)
230 if process.returncode != 0:
231 raise CommandException(f
">> [Coretex] Falied to execute command. Returncode: {process.returncode}")
233 return scores, positions, sequenceLengths
236 def chmodX(file: Path) ->
None:
237 command([
"chmod",
"+x", str(file.absolute())])
240 def loadFa(dataset: CustomDataset) -> List[Path]:
241 inputFiles: List[Path] = []
243 for sample
in dataset.samples:
246 for file
in Path(sample.path).iterdir():
247 if file.suffix.startswith(
".fa"):
248 inputFiles.append(file)
250 if len(inputFiles) == 0:
251 raise ValueError(
">> [Coretex] No sequence reads found")