Coretex
token.py
1
# Copyright (C) 2023 Coretex LLC
2
3
# This file is part of Coretex.ai
4
5
# This program is free software: you can redistribute it and/or modify
6
# it under the terms of the GNU Affero General Public License as
7
# published by the Free Software Foundation, either version 3 of the
8
# License, or (at your option) any later version.
9
10
# This program is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
# GNU Affero General Public License for more details.
14
15
# You should have received a copy of the GNU Affero General Public License
16
# along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18
# from typing import List, Optional
19
# from typing_extensions import Self
20
21
# from deepspeech import TokenMetadata
22
23
# from ..codable import Codable
24
25
26
# class Token(Codable):
27
28
# """
29
# Represents a single token from text
30
31
# Properties
32
# ----------
33
# text : str
34
# textual value of the token
35
# startIndex : int
36
# starting index of the token
37
# endIndex : int
38
# ending index of the token
39
# startTime : Optional[float]
40
# starting time of token in audio file
41
# (only if token was extracted from audio transcription)
42
# endTime : Optional[float]
43
# ending time of token in audio file
44
# (only if token was extracted from audio transcription)
45
# """
46
47
# text: str
48
# startIndex: int
49
# endIndex: int
50
# startTime: Optional[float]
51
# endTime: Optional[float]
52
53
# @classmethod
54
# def create(
55
# cls,
56
# text: str,
57
# startIndex: int,
58
# endIndex: int,
59
# startTime: Optional[float],
60
# endTime: Optional[float]
61
# ) -> Self:
62
63
# obj = cls()
64
65
# obj.text = text
66
# obj.startIndex = startIndex
67
# obj.endIndex = endIndex
68
# obj.startTime = startTime
69
# obj.endTime = endTime
70
71
# return obj
72
73
# @classmethod
74
# def fromTokenMetadata(cls, tokenMetadata: List[TokenMetadata]) -> List[Self]:
75
# """
76
# Creates a list of tokens from output of the deepspeech model
77
78
# Parameters
79
# ----------
80
# tokenMetadata : List[TokenMetadata]
81
# output of deepspeech model
82
83
# Returns
84
# -------
85
# List[Self] -> list of tokens
86
# """
87
88
# tokens: List[Self] = []
89
90
# startIndex: Optional[int] = None
91
# startTime: Optional[float] = None
92
# characters: List[str] = []
93
94
# for currentIndex, element in enumerate(tokenMetadata):
95
# if startIndex is None and len(characters) == 0:
96
# startIndex = currentIndex
97
98
# if startTime is None and len(characters) == 0:
99
# startTime = element.start_time
100
101
# if element.text.isspace() and startIndex is not None and startTime is not None and len(characters) > 0:
102
# token = cls.create("".join(characters), startIndex, currentIndex, startTime, element.start_time)
103
# tokens.append(token)
104
105
# startIndex = None
106
# startTime = None
107
# characters.clear()
108
109
# continue
110
111
# if not element.text.isspace():
112
# characters.append(element.text)
113
114
# if startIndex is not None and startTime is not None and len(characters) > 0:
115
# token = cls.create("".join(characters), startIndex, currentIndex, startTime, element.start_time)
116
# tokens.append(token)
117
118
# return tokens
119
120
# @classmethod
121
# def fromText(cls, text: str) -> List[Self]:
122
# """
123
# Tokenizes provided text
124
125
# Parameters:
126
# text: str -> text to be tokenized
127
128
# Returns:
129
# List[Self] -> list of tokens
130
# """
131
132
# tokens: List[Self] = []
133
134
# startIndex: Optional[int] = None
135
# characters: List[str] = []
136
137
# for currentIndex, character in enumerate(text):
138
# if startIndex is None and len(characters) == 0:
139
# startIndex = currentIndex
140
141
# if character.isspace() and startIndex is not None and len(characters) > 0:
142
# token = cls.create("".join(characters), startIndex, currentIndex, None, None)
143
# tokens.append(token)
144
145
# startIndex = None
146
# characters.clear()
147
148
# continue
149
150
# if not character.isspace():
151
# characters.append(character)
152
153
# if startIndex is not None and len(characters) > 0:
154
# token = cls.create("".join(characters), startIndex, currentIndex, None, None)
155
# tokens.append(token)
156
157
# return tokens
coretex
nlp
token.py
Generated by
1.9.1