Coretex
token.py
1 # Copyright (C) 2023 Coretex LLC
2 
3 # This file is part of Coretex.ai
4 
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License as
7 # published by the Free Software Foundation, either version 3 of the
8 # License, or (at your option) any later version.
9 
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU Affero General Public License for more details.
14 
15 # You should have received a copy of the GNU Affero General Public License
16 # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 
18 # from typing import List, Optional
19 # from typing_extensions import Self
20 
21 # from deepspeech import TokenMetadata
22 
23 # from ..codable import Codable
24 
25 
26 # class Token(Codable):
27 
28 # """
29 # Represents a single token from text
30 
31 # Properties
32 # ----------
33 # text : str
34 # textual value of the token
35 # startIndex : int
36 # starting index of the token
37 # endIndex : int
38 # ending index of the token
39 # startTime : Optional[float]
40 # starting time of token in audio file
41 # (only if token was extracted from audio transcription)
42 # endTime : Optional[float]
43 # ending time of token in audio file
44 # (only if token was extracted from audio transcription)
45 # """
46 
47 # text: str
48 # startIndex: int
49 # endIndex: int
50 # startTime: Optional[float]
51 # endTime: Optional[float]
52 
53 # @classmethod
54 # def create(
55 # cls,
56 # text: str,
57 # startIndex: int,
58 # endIndex: int,
59 # startTime: Optional[float],
60 # endTime: Optional[float]
61 # ) -> Self:
62 
63 # obj = cls()
64 
65 # obj.text = text
66 # obj.startIndex = startIndex
67 # obj.endIndex = endIndex
68 # obj.startTime = startTime
69 # obj.endTime = endTime
70 
71 # return obj
72 
73 # @classmethod
74 # def fromTokenMetadata(cls, tokenMetadata: List[TokenMetadata]) -> List[Self]:
75 # """
76 # Creates a list of tokens from output of the deepspeech model
77 
78 # Parameters
79 # ----------
80 # tokenMetadata : List[TokenMetadata]
81 # output of deepspeech model
82 
83 # Returns
84 # -------
85 # List[Self] -> list of tokens
86 # """
87 
88 # tokens: List[Self] = []
89 
90 # startIndex: Optional[int] = None
91 # startTime: Optional[float] = None
92 # characters: List[str] = []
93 
94 # for currentIndex, element in enumerate(tokenMetadata):
95 # if startIndex is None and len(characters) == 0:
96 # startIndex = currentIndex
97 
98 # if startTime is None and len(characters) == 0:
99 # startTime = element.start_time
100 
101 # if element.text.isspace() and startIndex is not None and startTime is not None and len(characters) > 0:
102 # token = cls.create("".join(characters), startIndex, currentIndex, startTime, element.start_time)
103 # tokens.append(token)
104 
105 # startIndex = None
106 # startTime = None
107 # characters.clear()
108 
109 # continue
110 
111 # if not element.text.isspace():
112 # characters.append(element.text)
113 
114 # if startIndex is not None and startTime is not None and len(characters) > 0:
115 # token = cls.create("".join(characters), startIndex, currentIndex, startTime, element.start_time)
116 # tokens.append(token)
117 
118 # return tokens
119 
120 # @classmethod
121 # def fromText(cls, text: str) -> List[Self]:
122 # """
123 # Tokenizes provided text
124 
125 # Parameters:
126 # text: str -> text to be tokenized
127 
128 # Returns:
129 # List[Self] -> list of tokens
130 # """
131 
132 # tokens: List[Self] = []
133 
134 # startIndex: Optional[int] = None
135 # characters: List[str] = []
136 
137 # for currentIndex, character in enumerate(text):
138 # if startIndex is None and len(characters) == 0:
139 # startIndex = currentIndex
140 
141 # if character.isspace() and startIndex is not None and len(characters) > 0:
142 # token = cls.create("".join(characters), startIndex, currentIndex, None, None)
143 # tokens.append(token)
144 
145 # startIndex = None
146 # characters.clear()
147 
148 # continue
149 
150 # if not character.isspace():
151 # characters.append(character)
152 
153 # if startIndex is not None and len(characters) > 0:
154 # token = cls.create("".join(characters), startIndex, currentIndex, None, None)
155 # tokens.append(token)
156 
157 # return tokens