from typing import Dict, Optional, Any, Union
from typing_extensions import Self
from pathlib import Path

import logging

from .base import BaseSequenceDataset
from ..network_dataset import NetworkDataset, _chunkSampleImport, _encryptedSampleImport
from ...sample import SequenceSample, CustomSample
from ...._folder_manager import folder_manager
from ....codable import KeyDescriptor
from ....cryptography import getProjectKey
from ....utils import file as file_utils
# Sequence Dataset class which is used for Datasets whose
# samples contain sequence data (.fasta, .fastq)

# Sample stored under the "_metadata" name; it is assigned in onDecode and
# in createSequenceDataset and is kept apart from the regular samples.
metadata: CustomSample
def __init__(self) -> None:
    """
        Initializes the dataset by forwarding SequenceSample
        to the parent initializer
    """

    super().__init__(SequenceSample)
@classmethod
def _keyDescriptors(cls) -> Dict[str, KeyDescriptor]:
    """
        Extends the key descriptors inherited from the parent class
        with a descriptor for the "samples" field

        Returns
        -------
        Dict[str, KeyDescriptor] -> inherited descriptors with the
        "samples" entry replaced
    """

    descriptors = super()._keyDescriptors()

    # NOTE(review): presumably this maps the serialized "sessions" key onto
    # the "samples" field as a list of SequenceSample - confirm against KeyDescriptor
    descriptors["samples"] = KeyDescriptor("sessions", SequenceSample, list)

    return descriptors
def onDecode(self) -> None:
    """
        Separates the "_metadata" sample from the regular samples:
        it is re-decoded as a CustomSample, stored in self.metadata
        and removed from self.samples

        Raises
        ------
        FileNotFoundError -> if the dataset has no sample named "_metadata"
    """

    metadataSample = self.getSample("_metadata")
    if metadataSample is None:
        raise FileNotFoundError(">> [Coretex] _metadata sample could not be found in the dataset")

    # Round-trip through encode/decode to obtain a CustomSample instance
    self.metadata = CustomSample.decode(metadataSample.encode())

    # Keep every sample except the metadata one
    self.samples = [
        sample for sample in self.samples
        if sample.id != self.metadata.id
    ]
@classmethod
def createSequenceDataset(
    cls,
    name: str,
    projectId: int,
    metadataPath: Union[Path, str],
    meta: Optional[Dict[str, Any]] = None
) -> Optional[Self]:

    """
        Creates a new sequence dataset with the provided name and metadata

        Parameters
        ----------
        name : str
            name passed to createDataset
        projectId : int
            project for which the dataset will be created
        metadataPath : Union[Path, str]
            path to the zipped metadata file
        meta : Optional[Dict[str, Any]]
            passed unchanged to createDataset

        Returns
        -------
        Optional[Self] -> The created sequence dataset object or None if creation failed

        Example
        -------
        >>> from coretex import SequenceDataset
        \b
        >>> dummyDataset = SequenceDataset.createSequenceDataset("dummyDataset", 123, pathToMetadata)
        >>> if dummyDataset is not None:
        ...     print("Dataset created successfully")
    """

    if isinstance(metadataPath, str):
        metadataPath = Path(metadataPath)

    dataset = cls.createDataset(name, projectId, meta)

    # The metadata archive is uploaded as the "_metadata" sample; encrypted
    # datasets use the encrypted import with the project key
    if dataset.isEncrypted:
        dataset.metadata = _encryptedSampleImport(CustomSample, "_metadata", metadataPath, dataset.id, getProjectKey(dataset.projectId))
    else:
        dataset.metadata = _chunkSampleImport(CustomSample, "_metadata", metadataPath, dataset.id)

    # NOTE(review): nothing in this method returns None itself; failures in the
    # helpers presumably surface as exceptions - confirm
    return dataset
def download(self, decrypt: bool = True, ignoreCache: bool = False) -> None:
    """
        Downloads the dataset by delegating to the parent implementation

        Parameters
        ----------
        decrypt : bool
            forwarded unchanged to the parent download
        ignoreCache : bool
            forwarded unchanged to the parent download
    """

    super().download(decrypt, ignoreCache)

    # NOTE(review): the source listing has a gap right after this call, so a
    # following statement may have been lost (e.g. downloading self.metadata) -
    # verify against the original file
def isPairedEnd(self) -> bool:
    """
        This function returns True if the dataset holds paired-end reads and
        False if it holds single end. Files for paired-end reads must contain
        "_R1_" and "_R2_" in their names, otherwise an exception will be raised.
        If the sample contains gzip compressed sequences, you will have to call
        Sample.unzip method first, otherwise calling Sample.isPairedEnd will
        presumably fail (the end of this sentence is missing from the source)

        A dataset with no samples is reported as paired-end, because
        all() of an empty list is True

        Returns
        -------
        bool -> True for paired-end samples only, False for single-end samples only

        Raises
        ------
        FileNotFoundError -> if no files meeting the requirements for either single-end
            or paired-end sequencing reads
        ValueError -> if dataset has a combination of single-end and paired-end samples
    """

    # NOTE(review): the def line is missing from the source listing; the name and
    # return type are reconstructed from the docstring - confirm
    pairedEndSamples = [sample.isPairedEnd() for sample in self.samples]

    if all(pairedEndSamples):
        return True

    if not any(pairedEndSamples):
        return False

    raise ValueError(">> [Coretex] Dataset contains a mix of paired-end and single-end sequences. It should contain either one or the other")
def _uploadSample(self, samplePath: Path, sampleName: str, **metadata: Any) -> SequenceSample:
    """
        Uploads a single sequence file as a sample of this dataset,
        zipping it first when it is not already an archive

        Parameters
        ----------
        samplePath : Path
            path to the sequence file or archive
        sampleName : str
            name under which the sample is imported
        **metadata : Any
            accepted but not used by this implementation

        Returns
        -------
        SequenceSample -> the imported sample

        Raises
        ------
        ValueError -> if samplePath is not a valid sequence file
    """

    if not self._sampleType.isValidSequenceFile(samplePath):
        raise ValueError(f"\"{samplePath}\" is not a valid sequence")

    # Archives are uploaded as they are
    if file_utils.isArchive(samplePath):
        return _chunkSampleImport(self._sampleType, sampleName, samplePath, self.id)

    # The import must finish inside the "with" block, while the temporary
    # archive path is still managed by folder_manager
    with folder_manager.tempFile() as archivePath:
        logging.getLogger("coretexpylib").info(f">> [Coretex] Provided Sample \"{samplePath}\" is not an archive, zipping...")
        file_utils.archive(samplePath, archivePath)

        sample = _chunkSampleImport(self._sampleType, sampleName, archivePath, self.id)

    return sample
Optional[SampleType] getSample(self, str name)
Self createDataset(cls, str name, int projectId, Optional[Dict[str, Any]] meta=None)
None download(self, bool decrypt=True, bool ignoreCache=False)
Optional[Self] createSequenceDataset(cls, str name, int projectId, Union[Path, str] metadataPath, Optional[Dict[str, Any]] meta=None)