Coretex
__init__.py
1 # Copyright (C) 2023 Coretex LLC
2 
3 # This file is part of Coretex.ai
4 
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License as
7 # published by the Free Software Foundation, either version 3 of the
8 # License, or (at your option) any later version.
9 
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Affero General Public License for more details.
14 
15 # You should have received a copy of the GNU Affero General Public License
16 # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 
18 from typing import Optional, Union
19 from pathlib import Path
20 
21 import os
22 
23 from .utils import compressGzip, createSample, getDemuxSamples, getDenoisedSamples, \
24  getFastqDPSamples, getFastqMPSamples, getImportedSamples, getMetadata, getPhylogeneticTreeSamples, \
25  isDemultiplexedSample, isDenoisedSample, isFastqDPSample, isFastqMPSample, \
26  isImportedSample, isPhylogeneticTreeSample, sampleNumber, isPairedEnd
27 
28 from ...utils import command
29 
30 
31 def toolsImport(
32  sequenceType: str,
33  inputPath: str,
34  outputPath: str,
35  inputFormat: Optional[str] = None
36 ) -> None:
37 
38  args = [
39  "qiime", "tools", "import",
40  "--type", sequenceType,
41  "--input-path", str(inputPath),
42  "--output-path", str(outputPath)
43  ]
44 
45  if inputFormat is not None:
46  args.extend([
47  "--input-format" , inputFormat
48  ])
49 
50  command(args)
51 
52 
53 def demuxEmpSingle(
54  sequencesPath: str,
55  barcodesPath: str,
56  barcodesColumn: str,
57  perSampleSequences: str,
58  errorCorretctionDetailsPath: str
59 ) -> None:
60 
61  command([
62  "qiime", "demux", "emp-single",
63  "--i-seqs", sequencesPath,
64  "--m-barcodes-file", barcodesPath,
65  "--m-barcodes-column", barcodesColumn,
66  "--o-per-sample-sequences", perSampleSequences,
67  "--o-error-correction-details", errorCorretctionDetailsPath,
68  "--verbose"
69  ])
70 
71 
72 def demuxEmpPaired(
73  sequencesPath: Union[str, Path],
74  barcodesPath: Union[str, Path],
75  barcodesColumn: str,
76  perSampleSequences: Union[str, Path],
77  errorCorretctionDetailsPath: Union[str, Path]
78 ) -> None:
79  """
80  Wrapper for QIIME's demux emp-paired command, which performs demultiplexing of multiplexed
81  sequences in the Earth Microbiome Project amplicon sequencing standard format
82  http://www.earthmicrobiome.org/protocols-and-standards/.
83 
84  Parameters
85  ----------
86  sequencesPath : Union[str, Path]
87  Path to the paired-end demultiplexed sequences to be denoised
88  barcodesPath : Union[str, Path]
89  Output path to the resulting feature sequences
90  perSampleSequences : Union[str, Path]
91  Output path to the resulting feature table
92  errorCorretctionDetailsPath : Union[str, Path]
93  Output path to the statistics of the denoising
94  """
95 
96  if isinstance(sequencesPath, Path):
97  sequencesPath = str(sequencesPath)
98 
99  if isinstance(barcodesPath, Path):
100  barcodesPath = str(barcodesPath)
101 
102  if isinstance(perSampleSequences, Path):
103  perSampleSequences = str(perSampleSequences)
104 
105  if isinstance(errorCorretctionDetailsPath, Path):
106  errorCorretctionDetailsPath = str(errorCorretctionDetailsPath)
107 
108  command([
109  "qiime", "demux", "emp-paired",
110  "--i-seqs", sequencesPath,
111  "--m-barcodes-file", barcodesPath,
112  "--m-barcodes-column", barcodesColumn,
113  "--o-per-sample-sequences", perSampleSequences,
114  "--o-error-correction-details", errorCorretctionDetailsPath,
115  "--verbose"
116  ])
117 
118 
119 def demuxSummarize(dataPath: str, visualizationPath: str) -> None:
120  command([
121  "qiime", "demux", "summarize",
122  "--i-data", dataPath,
123  "--o-visualization", visualizationPath
124  ])
125 
126 
127 def dada2DenoiseSingle(
128  inputPath: Union[str, Path],
129  trimLeft: int,
130  truncLen: int,
131  representativeSequencesPath: Union[str, Path],
132  tablePath: Union[str, Path],
133  denoisingStatsPath: Union[str, Path],
134  threads: Optional[int] = -1
135 ) -> None:
136  """
137  Wrapper for QIIME's DADA2 denoise-single, which performs denoising on the input paired-end reads.
138 
139  Parameters
140  ----------
141  inputPath : Union[str, Path]
142  Path to the paired-end demultiplexed sequences to be denoised
143  trimLeft : int
144  Position at which read sequences should be trimmed due to low quality.
145  This trims the 5' end of the input sequences, which will be the bases that were
146  sequenced in the first cycles
147  truncLen : int
148  Position at which read sequences should be truncated due to decrease in quality.
149  This truncates the 3' end of the input sequences, which will be the bases that
150  were sequenced in the last cycles. Reads that are shorter than this value will be discarded.
151  After this parameter is applied there must still be at least a 20 nucleotide overlap
152  between the forward and reverse reads. If 0 is provided, no truncation or length
153  filtering will be performed
154  representativeSequencesPath : Union[str, Path]
155  Output path to the resulting feature sequences. Each feature in the feature table will
156  be represented by exactly one sequence, and these sequences will be the
157  joined paired-end sequences
158  tablePath : Union[str, Path]
159  Output path to the resulting feature table
160  denoisingStatsPath : Union[str, Path]
161  Output path to the statistics of the denoising
162  threads : Optional[int]
163  Number of threads to use for the process. If None, multithreading will not be used.
164  Set to -1 (defaul) to use a thread for each CPU core
165  """
166 
167  if isinstance(inputPath, Path):
168  inputPath = str(inputPath)
169 
170  if isinstance(representativeSequencesPath, Path):
171  representativeSequencesPath = str(representativeSequencesPath)
172 
173  if isinstance(tablePath, Path):
174  tablePath = str(tablePath)
175 
176  if isinstance(denoisingStatsPath, Path):
177  denoisingStatsPath = str(denoisingStatsPath)
178 
179  args = [
180  "qiime", "dada2", "denoise-single",
181  "--i-demultiplexed-seqs", inputPath,
182  "--p-trim-left", str(trimLeft),
183  "--p-trunc-len", str(truncLen),
184  "--o-representative-sequences", representativeSequencesPath,
185  "--o-table", tablePath,
186  "--o-denoising-stats", denoisingStatsPath
187  ]
188 
189  if threads == -1:
190  args.extend(["--p-n-threads", "0"])
191  elif threads is not None:
192  args.extend(["--p-n-threads", str()])
193 
194  command(args)
195 
196 
197 def dada2DenoisePaired(
198  inputPath: Union[str, Path],
199  trimLeftF: int,
200  trimLeftR: int,
201  truncLenF: int,
202  truncLenR: int,
203  representativeSequencesPath: Union[str, Path],
204  tablePath: Union[str, Path],
205  denoisingStatsPath: Union[str, Path],
206  threads: Optional[int] = -1
207 ) -> None:
208  """
209  Wrapper for QIIME's DADA2 denoise-paired, which performs denoising on the input paired-end reads.
210 
211  Parameters
212  ----------
213  inputPath : Union[str, Path]
214  Path to the paired-end demultiplexed sequences to be denoised
215  trimLeftF : int
216  Position at which forward read sequences should be trimmed due to low quality.
217  This trims the 5' end of the input sequences, which will be the bases that were
218  sequenced in the first cycles
219  trimLeftR : int
220  Position at which reverse read sequences should be trimmed due to low quality.
221  This trims the 5' end of the input sequences, which will be the bases that were
222  sequenced in the first cycles
223  truncLenF : int
224  Position at which forward read sequences should be truncated due to decrease in quality.
225  This truncates the 3' end of the input sequences, which will be the bases that
226  were sequenced in the last cycles. Reads that are shorter than this value will be discarded.
227  After this parameter is applied there must still be at least a 20 nucleotide overlap
228  between the forward and reverse reads. If 0 is provided, no truncation or length
229  filtering will be performed
230  truncLenR : int
231  Position at which reverse read sequences should be truncated due to decrease in quality.
232  This truncates the 3' end of the input sequences, which will be the bases that
233  were sequenced in the last cycles. Reads that are shorter than this value will be discarded.
234  After this parameter is applied there must still be at least a 20 nucleotide overlap
235  between the forward and reverse reads. If 0 is provided, no truncation or length
236  filtering will be performed
237  representativeSequencesPath : Union[str, Path]
238  Output path to the resulting feature sequences. Each feature in the feature table will
239  be represented by exactly one sequence, and these sequences will be the
240  joined paired-end sequences
241  tablePath : Union[str, Path]
242  Output path to the resulting feature table
243  denoisingStatsPath : Union[str, Path]
244  Output path to the statistics of the denoising
245  threads : Optional[int]
246  Number of threads to use for the process. If None, multithreading will not be used.
247  Set to -1 (defaul) to use a thread for each CPU core
248  """
249 
250  if isinstance(inputPath, Path):
251  inputPath = str(inputPath)
252 
253  if isinstance(representativeSequencesPath, Path):
254  representativeSequencesPath = str(representativeSequencesPath)
255 
256  if isinstance(tablePath, Path):
257  tablePath = str(tablePath)
258 
259  if isinstance(denoisingStatsPath, Path):
260  denoisingStatsPath = str(denoisingStatsPath)
261 
262  args = [
263  "qiime", "dada2", "denoise-paired",
264  "--i-demultiplexed-seqs", inputPath,
265  "--p-trim-left-f", str(trimLeftF),
266  "--p-trim-left-r", str(trimLeftR),
267  "--p-trunc-len-f", str(truncLenF),
268  "--p-trunc-len-r", str(truncLenR),
269  "--o-representative-sequences", representativeSequencesPath,
270  "--o-table", tablePath,
271  "--o-denoising-stats", denoisingStatsPath
272  ]
273 
274  if threads == -1:
275  args.extend(["--p-n-threads", "0"])
276  elif threads is not None:
277  args.extend(["--p-n-threads", str(threads)])
278 
279  command(args)
280 
281 
282 def metadataTabulate(inputFile: str, visualizationPath: str) -> None:
283  command([
284  "qiime", "metadata", "tabulate",
285  "--m-input-file", inputFile,
286  "--o-visualization", visualizationPath
287  ])
288 
289 
290 def featureTableSummarize(inputPath: str, visualizationPath: str, metadataPath: str) -> None:
291  command([
292  "qiime", "feature-table", "summarize",
293  "--i-table", inputPath,
294  "--o-visualization", visualizationPath,
295  "--m-sample-metadata-file", metadataPath
296  ])
297 
298 
299 def featureTableTabulateSeqs(inputPath: str, visualizationPath: str) -> None:
300  command([
301  "qiime", "feature-table", "tabulate-seqs",
302  "--i-data", inputPath,
303  "--o-visualization", visualizationPath
304  ])
305 
306 
307 def phylogenyAlignToTreeMafftFasttree(
308  sequencesPath: str,
309  aligmentPath: str,
310  maskedAligmentPath: str,
311  unrootedTreePath: str,
312  rootedTreePath: str,
313  threads: Optional[int] = -1
314 ) -> None:
315 
316  args = [
317  "qiime", "phylogeny", "align-to-tree-mafft-fasttree",
318  "--i-sequences", sequencesPath,
319  "--o-alignment", aligmentPath,
320  "--o-masked-alignment", maskedAligmentPath,
321  "--o-tree", unrootedTreePath,
322  "--o-rooted-tree", rootedTreePath
323  ]
324 
325  if threads == -1:
326  args.extend(["--p-n-threads", "auto"])
327  elif threads is not None:
328  args.extend(["--p-n-threads", str(threads)])
329 
330  command(args)
331 
332 
333 def diversityCoreMetricsPhylogenetic(
334  phlogenyPath: str,
335  tablePath: str,
336  samplingDepth: int,
337  metadataPath: str,
338  outputDir: str,
339  threads: Optional[int] = -1
340 ) -> None:
341 
342  args = [
343  "qiime", "diversity", "core-metrics-phylogenetic",
344  "--i-phylogeny", phlogenyPath,
345  "--i-table", tablePath,
346  "--p-sampling-depth", str(samplingDepth),
347  "--m-metadata-file", metadataPath,
348  "--output-dir", outputDir
349  ]
350 
351  if threads == -1:
352  args.extend(["--p-n-jobs-or-threads", "auto"])
353  elif threads is not None:
354  args.extend(["--p-n-jobs-or-threads", str(threads)])
355 
356  command(args)
357 
358 
359 def diversityAlphaGroupSignificance(
360  alphaDiversityPath: str,
361  metadataPath: str,
362  visualizationPath: str
363 ) -> None:
364 
365  command([
366  "qiime", "diversity", "alpha-group-significance",
367  "--i-alpha-diversity", alphaDiversityPath,
368  "--m-metadata-file", metadataPath,
369  "--o-visualization", visualizationPath
370  ])
371 
372 
373 def diversityBetaGroupSignificance(
374  distanceMatrixPath: str,
375  metadataPath: str,
376  metadataColumn: str,
377  visualizationPath: str,
378  pairwise: bool
379 ) -> None:
380 
381  command([
382  "qiime", "diversity", "beta-group-significance",
383  "--i-distance-matrix", distanceMatrixPath,
384  "--m-metadata-file", metadataPath,
385  "--m-metadata-column", metadataColumn,
386  "--o-visualization", visualizationPath,
387  "--p-pairwise" if pairwise else ""
388  ])
389 
390 
391 def emperorPlot(
392  pcoaPath: str,
393  metadataPath: str,
394  customAxes: str,
395  visualizationPath: str
396 ) -> None:
397 
398  command([
399  "qiime", "emperor", "plot",
400  "--i-pcoa", pcoaPath,
401  "--m-metadata-file", metadataPath,
402  "--p-custom-axes", customAxes,
403  "--o-visualization", visualizationPath
404  ])
405 
406 
407 def diversityAlphaRarefaction(
408  tablePath: str,
409  phylogenyPath: str,
410  maxDepth: int,
411  metadataPath: str,
412  visualizationPath: str
413 ) -> None:
414 
415  command([
416  "qiime", "diversity", "alpha-rarefaction",
417  "--i-table", tablePath,
418  "--i-phylogeny", phylogenyPath,
419  "--p-max-depth", str(maxDepth),
420  "--m-metadata-file", metadataPath,
421  "--o-visualization", visualizationPath
422  ])
423 
424 
425 def featureClassifierClassifySklearn(
426  classifierPath: str,
427  readsPath: str,
428  classificationPath: str,
429  threads: Optional[int] = -1
430 ) -> None:
431 
432  args = [
433  "qiime", "feature-classifier", "classify-sklearn",
434  "--i-classifier", classifierPath,
435  "--i-reads", readsPath,
436  "--o-classification", classificationPath
437  ]
438 
439  if threads == -1:
440  args.extend(["--p-n-jobs", "-1"])
441  elif threads is not None:
442  args.extend(["--p-n-jobs", str(threads)])
443 
444  command(args)
445 
446 
447 def taxaBarplot(
448  tablePath: str,
449  taxonomyPath: str,
450  metadataPath: str,
451  visualizationPath: str
452 ) -> None:
453 
454  command([
455  "qiime", "taxa", "barplot",
456  "--i-table", tablePath,
457  "--i-taxonomy", taxonomyPath,
458  "--m-metadata-file", metadataPath,
459  "--o-visualization", visualizationPath
460  ])
461 
462 
463 def featureTableFilterSamples(
464  tablePath: str,
465  metadataPath: str,
466  where: str,
467  filteredTablePath: str
468 ) -> None:
469 
470  command([
471  "qiime", "feature-table", "filter-samples",
472  "--i-table", tablePath,
473  "--m-metadata-file", metadataPath,
474  "--p-where", where,
475  "--o-filtered-table", filteredTablePath
476  ])
477 
478 
479 def compositionAddPseudocount(tablePath: str, compositionTablePath: str) -> None:
480  command([
481  "qiime", "composition", "add-pseudocount",
482  "--i-table", tablePath,
483  "--o-composition-table", compositionTablePath
484  ])
485 
486 
487 def compositionAncom(
488  tablePath: str,
489  metadataPath: str,
490  metadataColumn: str,
491  visualizationPath: str
492 ) -> None:
493 
494  command([
495  "qiime", "composition", "ancom",
496  "--i-table", tablePath,
497  "--m-metadata-file", metadataPath,
498  "--m-metadata-column", metadataColumn,
499  "--o-visualization", visualizationPath
500  ])
501 
502 
503 def taxaCollapse(
504  tablePath: str,
505  taxonomyPath: str,
506  level: int,
507  collapsedTablePath: str
508 ) -> None:
509 
510  command([
511  "qiime", "taxa", "collapse",
512  "--i-table", tablePath,
513  "--i-taxonomy", taxonomyPath,
514  "--p-level", str(level),
515  "--o-collapsed-table", collapsedTablePath
516  ])
517 
518 def vsearchClusterDeNovo(
519  tablePath: Union[str, Path],
520  representativeSequencesPath: Union[str, Path],
521  percIdentity: float,
522  clusteredTablePath: Union[str, Path],
523  clusteredSequencesPath: Union[str, Path],
524  threads: Optional[int] = -1
525 ) -> None:
526  """
527  Wrapper for QIIME2's vsearch de novo clusteing command.
528 
529  Parameters
530  ----------
531  tablePath : Union[str, Path]
532  Path to the feature table generated by DADA2 denoise
533  representativeSequencesPath : Union[str, Path]
534  Path to the represenative sequences generated by DADA2 denoise
535  percIdentity : float
536  Percent identiy threshold for the OTU clustering
537  clusteredTablePath : Union[str, Path]
538  Path to the output clustered feature table
539  clusteredSequencesPath : Union[str, Path]
540  Path to the output clustered sequences
541  multithreading : bool
542  Number of threads to use for the process. If None, multithreading will not be used.
543  Set to -1 (defaul) to use a thread for each CPU core
544  """
545 
546  if isinstance(tablePath, Path):
547  tablePath = str(tablePath)
548 
549  if isinstance(representativeSequencesPath, Path):
550  representativeSequencesPath = str(representativeSequencesPath)
551 
552  if isinstance(clusteredTablePath, Path):
553  clusteredTablePath = str(clusteredTablePath)
554 
555  if isinstance(clusteredSequencesPath, Path):
556  clusteredSequencesPath = str(clusteredSequencesPath)
557 
558  args = [
559  "qiime", "vsearch", "cluster-features-de-novo",
560  "--i-table", tablePath,
561  "--i-sequences", representativeSequencesPath,
562  "--p-perc-identity", str(percIdentity),
563  "--o-clustered-table", clusteredTablePath,
564  "--o-clustered-sequences", clusteredSequencesPath
565  ]
566 
567  if threads == -1:
568  args.extend(["--p-threads", "0"])
569  elif threads is not None:
570  args.extend(["--p-threads", str(threads)])
571 
572  command(args)
573 
574 
575 def vsearchClusterClosedReference(
576  tablePath: Union[str, Path],
577  representativeSequencesPath: Union[str, Path],
578  referenceSequencesPath: Union[str, Path],
579  percIdentity: float,
580  clusteredTablePath: Union[str, Path],
581  clusteredSequencesPath: Union[str, Path],
582  unmatchedSequencesPath: Union[str, Path],
583  threads: Optional[int] = -1
584 ) -> None:
585  """
586  Wrapper for QIIME2's vsearch closed reference clusteing command.
587 
588  Parameters
589  ----------
590  tablePath : Union[str, Path]
591  Path to the feature table generated by DADA2 denoise
592  representativeSequencesPath : Union[str, Path]
593  Path to the represenative sequences generated by DADA2 denoise
594  referenceSequencesPath : Union[str, Path]
595  Path to reference OTU sequences
596  percIdentity : float
597  Percent identiy threshold for the OTU clustering
598  clusteredTablePath : Union[str, Path]
599  Path to the output clustered feature table
600  clusteredSequencesPath : Union[str, Path]
601  Path to the output clustered sequences
602  unmatchedSequencesPath : Union[str, Path]
603  Path to the output unmatched sequences
604  multithreading : bool
605  Number of threads to use for the process. If None, multithreading will not be used.
606  Set to -1 (defaul) to use a thread for each CPU core
607  """
608 
609  if isinstance(tablePath, Path):
610  tablePath = str(tablePath)
611 
612  if isinstance(representativeSequencesPath, Path):
613  representativeSequencesPath = str(representativeSequencesPath)
614 
615  if isinstance(referenceSequencesPath, Path):
616  referenceSequencesPath = str(referenceSequencesPath)
617 
618  if isinstance(clusteredTablePath, Path):
619  clusteredTablePath = str(clusteredTablePath)
620 
621  if isinstance(clusteredSequencesPath, Path):
622  clusteredSequencesPath = str(clusteredSequencesPath)
623 
624  if isinstance(unmatchedSequencesPath, Path):
625  unmatchedSequencesPath = str(unmatchedSequencesPath)
626 
627  args = [
628  "qiime", "vsearch", "cluster-features-closed-reference",
629  "--i-table", tablePath,
630  "--i-sequences", representativeSequencesPath,
631  "--i-reference-sequences", referenceSequencesPath,
632  "--p-perc-identity", str(percIdentity),
633  "--o-clustered-table", clusteredTablePath,
634  "--o-clustered-sequences", clusteredSequencesPath,
635  "--o-unmatched-sequences", unmatchedSequencesPath
636  ]
637 
638  if threads == -1:
639  args.extend(["--p-threads", "0"])
640  elif threads is not None:
641  args.extend(["--p-threads", str(threads)])
642 
643  command(args)
644 
645 
646 def vsearchClusterOpenReference(
647  tablePath: Union[str, Path],
648  representativeSequencesPath: Union[str, Path],
649  referenceSequencesPath: Union[str, Path],
650  percIdentity: float,
651  clusteredTablePath: Union[str, Path],
652  clusteredSequencesPath: Union[str, Path],
653  newReferenceSequencesPath: Union[str, Path],
654  threads: Optional[int] = -1
655 ) -> None:
656  """
657  Wrapper for QIIME2's vsearch open reference clusteing command.
658 
659  Parameters
660  ----------
661  tablePath : Union[str, Path]
662  Path to the feature table generated by DADA2 denoise
663  representativeSequencesPath : Union[str, Path]
664  Path to the represenative sequences generated by DADA2 denoise
665  referenceSequencesPath : Union[str, Path]
666  Path to reference OTU sequences
667  percIdentity : float
668  Percent identiy threshold for the OTU clustering
669  clusteredTablePath : Union[str, Path]
670  Path to the output clustered feature table
671  clusteredSequencesPath : Union[str, Path]
672  Path to the output clustered sequences
673  newReferenceSequencesPath : Union[str, Path]
674  Path to the output new reference sequences
675  multithreading : bool
676  Number of threads to use for the process. If None, multithreading will not be used.
677  Set to -1 (defaul) to use a thread for each CPU core
678  """
679 
680  if isinstance(tablePath, Path):
681  tablePath = str(tablePath)
682 
683  if isinstance(representativeSequencesPath, Path):
684  representativeSequencesPath = str(representativeSequencesPath)
685 
686  if isinstance(referenceSequencesPath, Path):
687  referenceSequencesPath = str(referenceSequencesPath)
688 
689  if isinstance(clusteredTablePath, Path):
690  clusteredTablePath = str(clusteredTablePath)
691 
692  if isinstance(clusteredSequencesPath, Path):
693  clusteredSequencesPath = str(clusteredSequencesPath)
694 
695  if isinstance(newReferenceSequencesPath, Path):
696  newReferenceSequencesPath = str(newReferenceSequencesPath)
697 
698  args = [
699  "qiime", "vsearch", "cluster-features-open-reference",
700  "--i-table", tablePath,
701  "--i-sequences", representativeSequencesPath,
702  "--i-reference-sequences", referenceSequencesPath,
703  "--p-perc-identity", str(percIdentity),
704  "--o-clustered-table", clusteredTablePath,
705  "--o-clustered-sequences", clusteredSequencesPath,
706  "--o-new-reference-sequences", newReferenceSequencesPath
707  ]
708 
709  if threads == -1:
710  args.extend(["--p-threads", "0"])
711  elif threads is not None:
712  args.extend(["--p-threads", str(threads)])
713 
714  command(args)