Coretex
__init__.py
1 # Copyright (C) 2023 Coretex LLC
2 
3 # This file is part of Coretex.ai
4 
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License as
7 # published by the Free Software Foundation, either version 3 of the
8 # License, or (at your option) any later version.
9 
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Affero General Public License for more details.
14 
15 # You should have received a copy of the GNU Affero General Public License
16 # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 
18 from typing import Optional, Union, List
19 from pathlib import Path
20 
21 from ..utils import command
22 from ..entities import CustomDataset
23 
24 
def cutadaptTrim(
    forwardFile: Union[str, Path],
    forwardOutput: Union[str, Path],
    forwardAdapter: str,
    reverseFile: Optional[Union[str, Path]] = None,
    reverseOutput: Optional[Union[str, Path]] = None,
    reverseAdapter: Optional[str] = None
) -> None:
    """
    Used to trim adapter sequences from single-end and paired-end sequencing reads

    Builds a "cutadapt" argument list and hands it to the "command" helper.
    For single-end reads pass only the forward arguments. For paired-end
    reads pass all three reverse arguments as well.

    Parameters
    ----------
    forwardFile : Union[str, Path]
        Path to the file holding forward sequences
    forwardOutput : Union[str, Path]
        Path to the output file for forward sequences
    forwardAdapter : str
        The adapter sequence for the forward reads (passed with "-g")
    reverseFile : Optional[Union[str, Path]]
        Path to the file holding reverse sequences (pass for paired-end reads,
        otherwise only forward is required for single-end)
    reverseOutput : Optional[Union[str, Path]]
        Path to the output file for reverse sequences (passed with "-p",
        pass for paired-end reads, otherwise only forward is required
        for single-end)
    reverseAdapter : Optional[str]
        The adapter sequence for the reverse reads (passed with "-G",
        pass for paired-end reads, otherwise only forward is required
        for single-end)

    Raises
    ------
    ValueError -> if only some of reverseFile, reverseOutput and
    reverseAdapter are provided
    """

    # The reverse arguments only make sense as a complete set: a partial
    # combination would produce a cutadapt invocation with mismatched
    # inputs and outputs, so reject it before anything is executed.
    reverseArgs = (reverseFile, reverseOutput, reverseAdapter)
    providedCount = sum(arg is not None for arg in reverseArgs)

    if providedCount not in (0, len(reverseArgs)):
        raise ValueError(
            "reverseFile, reverseOutput and reverseAdapter must either all be provided "
            "(paired-end reads) or all be omitted (single-end reads)"
        )

    # str() leaves plain strings untouched and converts Path objects
    args: List[str] = [
        "cutadapt",
        "-o", str(forwardOutput),
        "-g", forwardAdapter
    ]

    if reverseOutput is not None and reverseAdapter is not None:
        args.extend([
            "-p", str(reverseOutput),
            "-G", reverseAdapter
        ])

    # Input files are positional and come after all the options
    args.append(str(forwardFile))
    if reverseFile is not None:
        args.append(str(reverseFile))

    command(args)
84 
85 
def isPairedEnd(dataset: CustomDataset) -> bool:
    """
    Tell whether every sample of the dataset holds paired-end sequences,
    i.e. exactly two files matching "*.fastq*" in the sample directory.
    Samples whose name starts with "_metadata" are not counted.

    Parameters
    ----------
    dataset : CustomDataset
        Coretex dataset whose samples are inspected

    Returns
    -------
    bool -> False as soon as a non-metadata sample does not contain exactly
    two fastq files, True otherwise (including when there is no such sample)
    """

    for entry in dataset.samples:
        # Every visited sample is unzipped, the metadata one included
        entry.unzip()

        isMetadata = entry.name.startswith("_metadata")
        if not isMetadata:
            fastqCount = sum(1 for _ in entry.path.glob("*.fastq*"))
            if fastqCount != 2:
                return False

    return True
109 
110  return True