File: file_morpho_stream.cc
Warning: line 362, column 7: Value stored to 'symbol' is never read
1 | /* |
2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante |
3 | * |
4 | * This program is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU General Public License as |
6 | * published by the Free Software Foundation; either version 2 of the |
7 | * License, or (at your option) any later version. |
8 | * |
9 | * This program is distributed in the hope that it will be useful, but |
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | * General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, see <https://www.gnu.org/licenses/>. |
16 | */ |
17 | /** |
18 | * Word class and MorphoStream class definitions |
19 | * |
20 | * @author Felipe Sánchez-Martínez |
21 | */ |
22 | |
23 | #include <apertium/file_morpho_stream.h> |
24 | #include <lttoolbox/string_utils.h> |
25 | #include "apertium_config.h" |
26 | #include <apertium/unlocked_cstdio.h> |
27 | |
28 | FileMorphoStream::FileMorphoStream(const char* ftxt, bool d, TaggerData *t) : |
29 | ms() { |
30 | foundEOF = false; |
31 | debug=d; |
32 | td = t; |
33 | me = td->getPatternList().newMatchExe(); |
34 | alphabet = td->getPatternList().getAlphabet(); |
35 | input.open(ftxt); |
36 | ca_any_char = alphabet(PatternList::ANY_CHAR); |
37 | ca_any_tag = alphabet(PatternList::ANY_TAG); |
38 | |
39 | ConstantManager &constants = td->getConstants(); |
40 | ca_kignorar = constants.getConstant("kIGNORAR"_u); |
41 | ca_kbarra = constants.getConstant("kBARRA"_u); |
42 | ca_kdollar = constants.getConstant("kDOLLAR"_u); |
43 | ca_kbegin = constants.getConstant("kBEGIN"_u); |
44 | ca_kmot = constants.getConstant("kMOT"_u); |
45 | ca_kmas = constants.getConstant("kMAS"_u); |
46 | ca_kunknown = constants.getConstant("kUNKNOWN"_u); |
47 | |
48 | map<UString, int> &tag_index = td->getTagIndex(); |
49 | ca_tag_keof = tag_index["TAG_kEOF"_u]; |
50 | ca_tag_kundef = tag_index["TAG_kUNDEF"_u]; |
51 | |
52 | end_of_file = false; |
53 | null_flush = false; |
54 | } |
55 | |
56 | FileMorphoStream::~FileMorphoStream() |
57 | { |
58 | delete me; |
59 | } |
60 | |
61 | TaggerWord * |
62 | FileMorphoStream::get_next_word() |
63 | { |
64 | if(vwords.size() != 0) |
65 | { |
66 | TaggerWord* word=vwords.front(); |
67 | vwords.erase(vwords.begin()); |
68 | |
69 | if(word->isAmbiguous()) |
70 | { |
71 | vector<UString> &ref = td->getDiscardRules(); |
72 | for(unsigned int i = 0; i < ref.size(); i++) |
73 | { |
74 | word->discardOnAmbiguity(ref[i]); |
75 | } |
76 | } |
77 | // cout << *word << endl; |
78 | return word; |
79 | } |
80 | |
81 | if(input.eof()) |
82 | { |
83 | return NULL__null; |
84 | } |
85 | |
86 | int ivwords = 0; |
87 | vwords.push_back(new TaggerWord()); |
88 | |
89 | while(true) |
90 | { |
91 | UChar32 symbol = input.get(); |
92 | if(input.eof() || (null_flush && symbol == '\0')) |
93 | { |
94 | end_of_file = true; |
95 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
96 | return get_next_word(); |
97 | } |
98 | if(symbol == '^') |
99 | { |
100 | readRestOfWord(ivwords); |
101 | return get_next_word(); |
102 | } |
103 | else |
104 | { |
105 | UString str = ""_u; |
106 | if(symbol == '\\') |
107 | { |
108 | symbol = input.get(); |
109 | str += '\\'; |
110 | str += symbol; |
111 | symbol = '\\'; |
112 | } |
113 | else |
114 | { |
115 | str += symbol; |
116 | } |
117 | |
118 | while(symbol != '^') |
119 | { |
120 | symbol = input.get(); |
121 | if(input.eof() || (null_flush && symbol == '\0')) { |
122 | end_of_file = true; |
123 | vwords[ivwords]->add_ignored_string(str); |
124 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
125 | return get_next_word(); |
126 | } else if(symbol == '\\') { |
127 | str += '\\'; |
128 | symbol = input.get(); |
129 | if(input.eof() || (null_flush && symbol == '\0')) { |
130 | end_of_file = true; |
131 | vwords[ivwords]->add_ignored_string(str); |
132 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
133 | return get_next_word(); |
134 | } |
135 | str += symbol; |
136 | symbol = '\\'; |
137 | } else if(symbol == '^') { |
138 | if(str.size() > 0) { |
139 | vwords[ivwords]->add_ignored_string(str); |
140 | } |
141 | readRestOfWord(ivwords); |
142 | return get_next_word(); |
143 | } else { |
144 | str += symbol; |
145 | } |
146 | } |
147 | } |
148 | } |
149 | } |
150 | |
151 | void |
152 | FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) |
153 | { |
154 | int floor = 0; |
155 | int last_type = -1; |
156 | int last_pos = 0; |
157 | int initial_iv = ivwords; |
158 | |
159 | ms.init(me->getInitial()); |
160 | for(int i = 0, limit = str.size(); i != limit; i++) |
161 | { |
162 | if(str[i] != '<') |
163 | { |
164 | if(str[i] == '+') |
165 | { |
166 | int val = ms.classifyFinals(me->getFinals()); |
167 | if(val != -1) |
168 | { |
169 | last_pos = i-1; |
170 | last_type = val; |
171 | } |
172 | } |
173 | ms.step(u_toloweru_tolower_72(str[i]), ca_any_char); |
174 | } |
175 | else |
176 | { |
177 | UString tag; |
178 | for(int j = i+1; j != limit; j++) |
179 | { |
180 | if(str[j] == '\\') |
181 | { |
182 | j++; |
183 | } |
184 | else if(str[j] == '>') |
185 | { |
186 | tag = str.substr(i, j-i+1); |
187 | i = j; |
188 | break; |
189 | } |
190 | } |
191 | |
192 | int symbol = alphabet(tag); |
193 | if(symbol) |
194 | { |
195 | ms.step(symbol, ca_any_tag); |
196 | } |
197 | else |
198 | { |
199 | ms.step(ca_any_tag); |
200 | } |
201 | } |
202 | |
203 | if(ms.size() == 0) |
204 | { |
205 | if(last_pos != floor) |
206 | { |
207 | vwords[ivwords]->add_tag(last_type, |
208 | str.substr(floor, last_pos - floor + 1), |
209 | td->getPreferRules()); |
210 | if(str[last_pos+1] == '+' && last_pos+1 < limit ) |
211 | { |
212 | floor = last_pos + 1; |
213 | last_pos = floor + 1; |
214 | vwords[ivwords]->set_plus_cut(true); |
215 | if (((int)vwords.size())<=((int)(ivwords+1))) |
216 | vwords.push_back(new TaggerWord(true)); |
217 | ivwords++; |
218 | ms.init(me->getInitial()); |
219 | } |
220 | i = floor++; |
221 | } |
222 | else |
223 | { |
224 | if (debug) |
225 | { |
226 | cerr<<"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<"' of '" << str << "'\n"; |
227 | cerr<<" This is because of an incomplete tagset definition or a dictionary error\n"; |
228 | } |
229 | vwords[ivwords]->add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); |
230 | return; |
231 | } |
232 | } |
233 | else if(i == limit - 1) |
234 | { |
235 | if(ms.classifyFinals(me->getFinals()) == -1) |
236 | { |
237 | if(last_pos != floor) |
238 | { |
239 | vwords[ivwords]->add_tag(last_type, |
240 | str.substr(floor, last_pos - floor + 1), |
241 | td->getPreferRules()); |
242 | if(str[last_pos+1] == '+' && last_pos+1 < limit ) |
243 | { |
244 | floor = last_pos + 1; |
245 | last_pos = floor; |
246 | vwords[ivwords]->set_plus_cut(true); |
247 | if (((int)vwords.size())<=((int)(ivwords+1))) |
248 | vwords.push_back(new TaggerWord(true)); |
249 | ivwords++; |
250 | ms.init(me->getInitial()); |
251 | } |
252 | i = floor++; |
253 | } |
254 | else |
255 | { |
256 | if (debug) |
257 | { |
258 | cerr<<"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<"' of '" << str << "'\n"; |
259 | cerr<<" This is because of an incomplete tagset definition or a dictionary error\n"; |
260 | } |
261 | vwords[ivwords]->add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); |
262 | return; |
263 | } |
264 | } |
265 | } |
266 | } |
267 | |
268 | int val = ms.classifyFinals(me->getFinals()); |
269 | if(val == -1) |
270 | { |
271 | val = ca_tag_kundef; |
272 | if (debug) |
273 | { |
274 | cerr<<"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<"' of '" << str << "'\n"; |
275 | cerr<<" This is because of an incomplete tagset definition or a dictionary error\n"; |
276 | } |
277 | if(ivwords > initial_iv) { |
278 | // We've partially added a multiword -- undo the previous add to avoid outputting a partial (chopped off) lexical form: |
279 | while(ivwords > initial_iv) { |
280 | delete vwords[ivwords]; |
281 | vwords.pop_back(); |
282 | ivwords--; |
283 | } |
284 | vwords[ivwords]->set_plus_cut(false); |
285 | vwords[ivwords]->erase_tag(last_type); |
286 | vwords[ivwords]->add_tag(last_type, str, td->getPreferRules()); |
287 | return; |
288 | } |
289 | } |
290 | vwords[ivwords]->add_tag(val, str.substr(floor), td->getPreferRules()); |
291 | } |
292 | |
293 | void |
294 | FileMorphoStream::readRestOfWord(int &ivwords) |
295 | { |
296 | // first we have the superficial form |
297 | UString str; |
298 | |
299 | while(true) |
300 | { |
301 | UChar32 symbol = input.get(); |
302 | if(input.eof() || (null_flush && symbol == '\0')) |
303 | { |
304 | end_of_file = true; |
305 | if(str.size() > 0) |
306 | { |
307 | vwords[ivwords]->add_ignored_string(str); |
308 | cerr<<"Warning (internal): kIGNORE was returned while reading a word\n"; |
309 | cerr<<"Word being read: "<<vwords[ivwords]->get_superficial_form()<<"\n"; |
310 | cerr<<"Debug: "<< str <<"\n"; |
311 | } |
312 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
313 | return; |
314 | } |
315 | else if(symbol == '\\') |
316 | { |
317 | symbol = input.get(); |
318 | str += '\\'; |
319 | str += symbol; |
320 | } |
321 | else if(symbol == '/') |
322 | { |
323 | vwords[ivwords]->set_superficial_form(str); |
324 | str.clear(); |
325 | break; |
326 | } |
327 | else if(symbol == '$') |
328 | { |
329 | vwords[ivwords]->set_superficial_form(str); |
330 | vwords[ivwords]->add_ignored_string("$"_u); |
331 | break; |
332 | } |
333 | else |
334 | { |
335 | str += symbol; |
336 | } |
337 | } |
338 | |
339 | // then we read the acceptions |
340 | |
341 | while(true) |
342 | { |
343 | UChar32 symbol = input.get(); |
344 | if(input.eof() || (null_flush && symbol == '\0')) |
345 | { |
346 | end_of_file = true; |
347 | if(str.size() > 0) |
348 | { |
349 | vwords[ivwords]->add_ignored_string(str); |
350 | cerr<<"Warning (internal): kIGNORE was returned while reading a word\n"; |
351 | cerr<<"Word being read: "<<vwords[ivwords]->get_superficial_form()<<"\n"; |
352 | cerr<<"Debug: "<< str <<"\n"; |
353 | } |
354 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
355 | return; |
356 | } |
357 | else if(symbol == '\\') |
358 | { |
359 | symbol = input.get(); |
360 | str += '\\'; |
361 | str += symbol; |
362 | symbol = '\\'; // to prevent exiting with '\$' |
Value stored to 'symbol' is never read | |
363 | } |
364 | else if(symbol == '/') |
365 | { |
366 | lrlmClassify(str, ivwords); |
367 | str.clear(); |
368 | ivwords = 0; |
369 | continue; |
370 | } |
371 | else if(symbol == '$') |
372 | { |
373 | if(str[0] != '*')// do nothing with unknown words |
374 | { |
375 | lrlmClassify(str, ivwords); |
376 | } |
377 | return; |
378 | } |
379 | else |
380 | { |
381 | str += symbol; |
382 | } |
383 | } |
384 | } |
385 | |
386 | void |
387 | FileMorphoStream::setNullFlush(bool nf) |
388 | { |
389 | null_flush = nf; |
390 | } |
391 | |
392 | bool |
393 | FileMorphoStream::getEndOfFile(void) |
394 | { |
395 | return end_of_file; |
396 | } |
397 | |
398 | void |
399 | FileMorphoStream::setEndOfFile(bool eof) |
400 | { |
401 | end_of_file = eof; |
402 | } |
403 | |
404 | void |
405 | FileMorphoStream::rewind() |
406 | { |
407 | input.rewind(); |
408 | end_of_file = false; |
409 | } |