Coretex
sequence_dataset.py
# Copyright (C) 2023 Coretex LLC

# This file is part of Coretex.ai

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

18 from typing import Dict, Optional, Any, Union
19 from typing_extensions import Self
20 from pathlib import Path
21 
22 import logging
23 
24 from .base import BaseSequenceDataset
25 from ..network_dataset import NetworkDataset, _chunkSampleImport, _encryptedSampleImport
26 from ...sample import SequenceSample, CustomSample
27 from ...._folder_manager import folder_manager
28 from ....codable import KeyDescriptor
29 from ....cryptography import getProjectKey
30 from ....utils import file as file_utils
31 
32 
class SequenceDataset(BaseSequenceDataset, NetworkDataset[SequenceSample]):

    """
    Sequence Dataset class which is used for Datasets whose
    samples contain sequence data (.fasta, .fastq)

    Besides the regular sequence samples, the dataset keeps a separate
    "_metadata" sample which is exposed through the ``metadata`` attribute
    instead of being listed among ``samples``.
    """

    # Sample named "_metadata"; assigned in onDecode and createSequenceDataset
    metadata: CustomSample

    def __init__(self) -> None:
        """
        Initializes the dataset with SequenceSample as its sample type
        """

        super().__init__(SequenceSample)

    @classmethod
    def _keyDescriptors(cls) -> Dict[str, KeyDescriptor]:
        """
        Extends the inherited key descriptors with the one for "samples"

        Returns
        -------
        Dict[str, KeyDescriptor] -> inherited descriptors with the "samples" entry replaced
        """

        descriptors = super()._keyDescriptors()

        # NOTE(review): "sessions" is presumably the serialized key that holds
        # the samples - confirm against the KeyDescriptor definition
        descriptors["samples"] = KeyDescriptor("sessions", SequenceSample, list)

        return descriptors

    def onDecode(self) -> None:
        """
        Separates the "_metadata" sample from the rest of the samples:
        it is re-decoded as a CustomSample, stored in ``metadata`` and
        removed from ``samples``

        Raises
        ------
        FileNotFoundError -> if the dataset has no sample named "_metadata"
        """

        metadataSample = self.getSample("_metadata")
        if metadataSample is None:
            raise FileNotFoundError(">> [Coretex] _metadata sample could not be found in the dataset")

        # Round-trip through encode/decode to obtain the same sample as a CustomSample
        self.metadata = CustomSample.decode(metadataSample.encode())

        # Keep only the actual sequence samples, the metadata sample is matched by id
        self.samples = [
            sample for sample in self.samples
            if sample.id != self.metadata.id
        ]

    @classmethod
    def createSequenceDataset(
        cls,
        name: str,
        projectId: int,
        metadataPath: Union[Path, str],
        meta: Optional[Dict[str, Any]] = None
    ) -> Optional[Self]:

        """
        Creates a new sequence dataset with the provided name and metadata

        Parameters
        ----------
        name : str
            dataset name
        projectId : int
            project for which the dataset will be created
        metadataPath : Union[Path, str]
            path to the zipped metadata file
        meta : Optional[Dict[str, Any]]
            forwarded unchanged to createDataset

        Returns
        -------
        The created sequence dataset object or None if creation failed

        Example
        -------
        >>> from coretex import SequenceDataset
        \b
        >>> dummyDataset = SequenceDataset.createSequenceDataset("dummyDataset", 123, pathToMetadata)
        >>> if dummyDataset is not None:
        ...     print("Dataset created successfully")
        """

        if isinstance(metadataPath, str):
            metadataPath = Path(metadataPath)

        dataset = cls.createDataset(name, projectId, meta)

        # The metadata file is always imported as a sample named "_metadata";
        # encrypted datasets additionally need the key of their project
        if dataset.isEncrypted:
            dataset.metadata = _encryptedSampleImport(CustomSample, "_metadata", metadataPath, dataset.id, getProjectKey(dataset.projectId))
        else:
            dataset.metadata = _chunkSampleImport(CustomSample, "_metadata", metadataPath, dataset.id)

        return dataset

    def download(self, decrypt: bool = True, ignoreCache: bool = False) -> None:
        """
        Downloads the dataset through the parent implementation and then
        downloads the metadata sample using the same arguments

        Parameters
        ----------
        decrypt : bool
            forwarded to the parent download and to the metadata sample download
        ignoreCache : bool
            forwarded to the parent download and to the metadata sample download
        """

        super().download(decrypt, ignoreCache)

        # The metadata sample is not part of "samples", so it is downloaded explicitly
        self.metadata.download(decrypt, ignoreCache)

    def isPairedEnd(self) -> bool:
        """
        This function returns True if the dataset holds paired-end reads and
        False if it holds single end. Files for paired-end reads must contain
        "_R1_" and "_R2_" in their names, otherwise an exception will be raised.
        If the sample contains gzip compressed sequences, you will have to call
        Sample.unzip method first otherwise calling Sample.isPairedEnd will
        raise an exception

        Returns
        -------
        bool -> True if every sample is paired-end, False if none of them is

        Raises
        ------
        FileNotFoundError -> if no files meeting the requirements for either single-end
        or paired-end sequencing reads
        ValueError -> if dataset has a combination of single-end and paired-end samples
        """

        pairedEndSamples = [sample.isPairedEnd() for sample in self.samples]

        # all() of an empty list is True, so a dataset without samples reports True
        if all(pairedEndSamples):
            return True

        if not any(pairedEndSamples):
            return False

        raise ValueError(">> [Coretex] Dataset contains a mix of paired-end and single-end sequences. It should contain either one or the other")

    def _uploadSample(self, samplePath: Path, sampleName: str, **metadata: Any) -> SequenceSample:
        """
        Validates the file at the provided path and imports it as a new sample
        of this dataset. Files which are not archives are zipped into a
        temporary file before being imported

        Parameters
        ----------
        samplePath : Path
            path to the sequence file or archive
        sampleName : str
            name of the created sample
        **metadata : Any
            accepted but not used by this implementation

        Returns
        -------
        SequenceSample -> the imported sample

        Raises
        ------
        ValueError -> if the sample type rejects the path as a sequence file
        """

        if not self._sampleType.isValidSequenceFile(samplePath):
            raise ValueError(f"\"{samplePath}\" is not a valid sequence")

        if file_utils.isArchive(samplePath):
            sample = _chunkSampleImport(self._sampleType, sampleName, samplePath, self.id)
        else:
            with folder_manager.tempFile() as archivePath:
                logging.getLogger("coretexpylib").info(f">> [Coretex] Provided Sample \"{samplePath}\" is not an archive, zipping...")
                file_utils.archive(samplePath, archivePath)

                # Import while the temporary archive is still available
                sample = _chunkSampleImport(self._sampleType, sampleName, archivePath, self.id)

        return sample
Optional[SampleType] getSample(self, str name)
Definition: dataset.py:90
Self createDataset(cls, str name, int projectId, Optional[Dict[str, Any]] meta=None)
None download(self, bool decrypt=True, bool ignoreCache=False)
Optional[Self] createSequenceDataset(cls, str name, int projectId, Union[Path, str] metadataPath, Optional[Dict[str, Any]] meta=None)