18 from typing
import List
19 from pathlib
import Path
20 from zipfile
import ZipFile
22 from ..local_sample
import LocalSample
23 from ....utils
import file
as file_utils
def getSequenceFile(directoryPath: Path, extensions: List[str]) -> Path:
    """
        Finds a sequence file among the direct children of a directory

        Parameters
        ----------
        directoryPath : Path
            directory whose direct children are inspected (no recursion)
        extensions : List[str]
            accepted file suffixes, each including the leading dot

        Returns
        -------
        Path -> the first regular file yielded by Path.iterdir whose
            suffix is one of the given extensions

        Raises
        ------
        FileNotFoundError -> if no regular file with a matching suffix exists
    """

    for path in directoryPath.iterdir():
        # A directory that merely carries a sequence-like suffix is not a sequence file
        if path.is_file() and path.suffix in extensions:
            return path

    raise FileNotFoundError(f">> [Coretex] {directoryPath} has no files with extensions \"{extensions}\"")
def getForwardSequenceFile(directoryPath: Path, extensions: List[str]) -> Path:
    """
        Finds the forward read file among the direct children of a directory

        Parameters
        ----------
        directoryPath : Path
            directory whose direct children are inspected (no recursion)
        extensions : List[str]
            accepted file suffixes, each including the leading dot

        Returns
        -------
        Path -> the first regular file yielded by Path.iterdir which has
            "_R1_" in its name and a suffix from the given extensions

        Raises
        ------
        FileNotFoundError -> if no regular file satisfies both conditions
    """

    for path in directoryPath.iterdir():
        # A directory that merely carries a sequence-like name is not a sequence file
        if path.is_file() and "_R1_" in path.name and path.suffix in extensions:
            return path

    raise FileNotFoundError(f">> [Coretex] {directoryPath} has no files with \"_R1_\" in name and extensions \"{extensions}\"")
def getReverseSequenceFile(directoryPath: Path, extensions: List[str]) -> Path:
    """
        Finds the reverse read file among the direct children of a directory

        Parameters
        ----------
        directoryPath : Path
            directory whose direct children are inspected (no recursion)
        extensions : List[str]
            accepted file suffixes, each including the leading dot

        Returns
        -------
        Path -> the first regular file yielded by Path.iterdir which has
            "_R2_" in its name and a suffix from the given extensions

        Raises
        ------
        FileNotFoundError -> if no regular file satisfies both conditions
    """

    for path in directoryPath.iterdir():
        # A directory that merely carries a sequence-like name is not a sequence file
        if path.is_file() and "_R2_" in path.name and path.suffix in extensions:
            return path

    raise FileNotFoundError(f">> [Coretex] {directoryPath} has no files with \"_R2_\" in name and extensions \"{extensions}\"")
53 Represents the local custom Sample class
54 which is used for working with Other Task locally
@classmethod
def supportedExtensions(cls) -> List[str]:
    """
        Returns
        -------
        List[str] -> file suffixes (each with the leading dot) which are
            treated as sequence files; a new list is built on every call
    """

    return [".fasta", ".fastq", ".fa", ".fq"]
64 Returns the path of the .fasta or .fastq sequence file
65 contained inside the sample. If the sample contains gzip compressed
66 sequences, you will have to call Sample.unzip method first otherwise
67 calling Sample.sequencePath will raise an exception
71 FileNotFoundError -> if no .fasta, .fastq, .fa, or .fq files are found inside the sample
78 Returns the path of the .fasta or .fastq forward sequence file
79 contained inside the sample. "_R1_" must be present in the filename
80 otherwise it will not be recognized. If the sample contains gzip compressed
81 sequences, you will have to call Sample.unzip method first otherwise
82 calling Sample.sequencePath will raise an exception
86 FileNotFoundError -> if no .fasta, .fastq, .fa, or .fq files are found inside the sample
93 Returns the path of the .fasta or .fastq reverse sequence file
94 contained inside the sample. "_R2_" must be present in the filename
95 otherwise it will not be recognized. If the sample contains gzip compressed
96 sequences, you will have to call Sample.unzip method first otherwise
97 calling Sample.sequencePath will raise an exception
101 FileNotFoundError -> if no .fasta, .fastq, .fa, or .fq files are found inside the sample
def unzip(self, ignoreCache: bool = False) -> None:
    """
        Runs the parent class unzip and then hands every "*<extension>.gz"
        file found directly inside the sample path to file_utils.gzipDecompress,
        once for each extension returned by supportedExtensions

        Parameters
        ----------
        ignoreCache : bool
            forwarded unchanged to the parent class unzip
    """

    super().unzip(ignoreCache)

    for extension in self.supportedExtensions():
        for compressedSequencePath in self.path.glob(f"*{extension}.gz"):
            # Path.stem drops only the trailing ".gz", so "reads.fastq.gz"
            # is written next to the archive as "reads.fastq"
            decompressedSequencePath = compressedSequencePath.parent / compressedSequencePath.stem

            file_utils.gzipDecompress(compressedSequencePath, decompressedSequencePath)
115 This function returns True if the sample holds paired-end reads and
116 False if it holds single end. Files for paired-end reads must contain
117 "_R1_" and "_R2_" in their names, otherwise an exception will be raised.
118 If the sample contains gzip compressed sequences, you will have to call
119 Sample.unzip method first otherwise calling Sample.isPairedEnd will
124 FileNotFoundError -> if no files meeting the requirements for either single-end
125 or paired-end sequencing reads
129 sampleContent = archive.namelist()
132 fileNames: List[str] = []
134 for fileName
in sampleContent:
135 if fileName.endswith(extension)
or fileName.endswith(f
"{extension}.gz"):
136 fileNames.append(fileName)
138 if len(fileNames) == 0:
141 if len(fileNames) == 1:
144 if len(fileNames) == 2:
145 forwardPresent = any([
"_R1_" in fileName
for fileName
in fileNames])
146 reversePresent = any([
"_R2_" in fileName
for fileName
in fileNames])
148 if forwardPresent
and reversePresent:
151 raise FileNotFoundError(f
">> [Coretex] Invalid sequence sample \"{self.name}\". Could not determine sequencing type")
None unzip(self, bool ignoreCache=False)
List[str] supportedExtensions(cls)