Coretex
local_sequence_sample.py
1 # Copyright (C) 2023 Coretex LLC
2 
3 # This file is part of Coretex.ai
4 
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License as
7 # published by the Free Software Foundation, either version 3 of the
8 # License, or (at your option) any later version.
9 
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Affero General Public License for more details.
14 
15 # You should have received a copy of the GNU Affero General Public License
16 # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 
18 from typing import List
19 from pathlib import Path
20 from zipfile import ZipFile
21 
22 from ..local_sample import LocalSample
23 from ....utils import file as file_utils
24 
25 
def getSequenceFile(directoryPath: Path, extensions: List[str]) -> Path:
    """
    Returns the first file inside the directory whose suffix
    is one of the provided extensions

    Parameters
    ----------
    directoryPath : Path
        directory whose direct children are searched (non-recursive)
    extensions : List[str]
        accepted file suffixes, including the leading dot (e.g. ".fasta")

    Returns
    -------
    Path -> path to the matching file; if multiple files match,
    the one which comes first in sorted path order is returned

    Raises
    ------
    FileNotFoundError -> if the directory has no files with the provided extensions
    """

    # Sorted so the result does not depend on the filesystem listing order
    for path in sorted(directoryPath.iterdir()):
        # Only regular files qualify - a directory with a matching
        # suffix is not a sequence file
        if path.is_file() and path.suffix in extensions:
            return path

    raise FileNotFoundError(f">> [Coretex] {directoryPath} has no files with extensions \"{extensions}\"")
32 
33 
def getForwardSequenceFile(directoryPath: Path, extensions: List[str]) -> Path:
    """
    Returns the first file inside the directory which has "_R1_"
    in its name and whose suffix is one of the provided extensions

    Parameters
    ----------
    directoryPath : Path
        directory whose direct children are searched (non-recursive)
    extensions : List[str]
        accepted file suffixes, including the leading dot (e.g. ".fastq")

    Returns
    -------
    Path -> path to the matching file; if multiple files match,
    the one which comes first in sorted path order is returned

    Raises
    ------
    FileNotFoundError -> if the directory has no files with "_R1_"
    in the name and one of the provided extensions
    """

    # Sorted so the result does not depend on the filesystem listing order
    for path in sorted(directoryPath.iterdir()):
        # Only regular files qualify - a directory with a matching
        # name is not a sequence file
        if not path.is_file():
            continue

        if "_R1_" in path.name and path.suffix in extensions:
            return path

    raise FileNotFoundError(f">> [Coretex] {directoryPath} has no files with \"_R1_\" in name and extensions \"{extensions}\"")
40 
41 
def getReverseSequenceFile(directoryPath: Path, extensions: List[str]) -> Path:
    """
    Returns the first file inside the directory which has "_R2_"
    in its name and whose suffix is one of the provided extensions

    Parameters
    ----------
    directoryPath : Path
        directory whose direct children are searched (non-recursive)
    extensions : List[str]
        accepted file suffixes, including the leading dot (e.g. ".fastq")

    Returns
    -------
    Path -> path to the matching file; if multiple files match,
    the one which comes first in sorted path order is returned

    Raises
    ------
    FileNotFoundError -> if the directory has no files with "_R2_"
    in the name and one of the provided extensions
    """

    # Sorted so the result does not depend on the filesystem listing order
    for path in sorted(directoryPath.iterdir()):
        # Only regular files qualify - a directory with a matching
        # name is not a sequence file
        if not path.is_file():
            continue

        if "_R2_" in path.name and path.suffix in extensions:
            return path

    raise FileNotFoundError(f">> [Coretex] {directoryPath} has no files with \"_R2_\" in name and extensions \"{extensions}\"")
48 
49 
51 
52  """
53  Represents the local custom Sample class
54  which is used for working with Other Task locally
55  """
56 
57  @classmethod
58  def supportedExtensions(cls) -> List[str]:
59  return [".fasta", ".fastq", ".fa", ".fq"]
60 
61  @property
62  def sequencePath(self) -> Path:
63  """
64  Returns the path of the .fasta or .fastq sequence file
65  contained inside the sample. If the sample contains gzip compressed
66  sequences, you will have to call Sample.unzip method first otherwise
67  calling Sample.sequencePath will raise an exception
68 
69  Raises
70  ------
71  FileNotFoundError -> if no .fasta, .fastq, .fq, or .fq files are found inside the sample
72  """
73  return getSequenceFile(self.pathpathpath, self.supportedExtensionssupportedExtensions())
74 
75  @property
76  def forwardPath(self) -> Path:
77  """
78  Returns the path of the .fasta or .fastq forward sequence file
79  contained inside the sample. "_R1_" must be present in the filename
80  otherwise it will not be recongnized. If the sample contains gzip compressed
81  sequences, you will have to call Sample.unzip method first otherwise
82  calling Sample.sequencePath will raise an exception
83 
84  Raises
85  ------
86  FileNotFoundError -> if no .fasta, .fastq, .fq, or .fq files are found inside the sample
87  """
88  return getForwardSequenceFile(self.pathpathpath, self.supportedExtensionssupportedExtensions())
89 
90  @property
91  def reversePath(self) -> Path:
92  """
93  Returns the path of the .fasta or .fastq sequence file
94  contained inside the sample. "_R2_" must be present in the filename
95  otherwise it will not be recongnized. If the sample contains gzip compressed
96  sequences, you will have to call Sample.unzip method first otherwise
97  calling Sample.sequencePath will raise an exception
98 
99  Raises
100  ------
101  FileNotFoundError -> if no .fasta, .fastq, .fq, or .fq files are found inside the sample
102  """
103  return getReverseSequenceFile(self.pathpathpath, self.supportedExtensionssupportedExtensions())
104 
105  def unzip(self, ignoreCache: bool = False) -> None:
106  super().unzip(ignoreCache)
107  for extension in self.supportedExtensionssupportedExtensions():
108  for compressedSequencePath in self.pathpathpath.glob(f"*{extension}.gz"):
109  decompressedSequencePath = compressedSequencePath.parent / compressedSequencePath.stem
110 
111  file_utils.gzipDecompress(compressedSequencePath, decompressedSequencePath)
112 
113  def isPairedEnd(self) -> bool:
114  """
115  This function returns True if the sample holds paired-end reads and
116  False if it holds single end. Files for paired-end reads must contain
117  "_R1_" and "_R2_" in their names, otherwise an exception will be raised.
118  If the sample contains gzip compressed sequences, you will have to call
119  Sample.unzip method first otherwise calling Sample.isPairedEnd will
120  raise an exception
121 
122  Raises
123  ------
124  FileNotFoundError -> if no files meeting the requirements for either single-end
125  or paired-end sequencing reads
126  """
127 
128  with ZipFile(self.zipPathzipPathzipPath, "r") as archive:
129  sampleContent = archive.namelist()
130 
131  for extension in self.supportedExtensionssupportedExtensions():
132  fileNames: List[str] = []
133 
134  for fileName in sampleContent:
135  if fileName.endswith(extension) or fileName.endswith(f"{extension}.gz"):
136  fileNames.append(fileName)
137 
138  if len(fileNames) == 0:
139  continue
140 
141  if len(fileNames) == 1:
142  return False
143 
144  if len(fileNames) == 2:
145  forwardPresent = any(["_R1_" in fileName for fileName in fileNames])
146  reversePresent = any(["_R2_" in fileName for fileName in fileNames])
147 
148  if forwardPresent and reversePresent:
149  return True
150 
151  raise FileNotFoundError(f">> [Coretex] Invalid sequence sample \"{self.name}\". Could not determine sequencing type")