| File: | file_morpho_stream.cc |
| Warning: | line 362, column 7 Value stored to 'symbol' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* |
| 2 | * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante |
| 3 | * |
| 4 | * This program is free software; you can redistribute it and/or |
| 5 | * modify it under the terms of the GNU General Public License as |
| 6 | * published by the Free Software Foundation; either version 2 of the |
| 7 | * License, or (at your option) any later version. |
| 8 | * |
| 9 | * This program is distributed in the hope that it will be useful, but |
| 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | * General Public License for more details. |
| 13 | * |
| 14 | * You should have received a copy of the GNU General Public License |
| 15 | * along with this program; if not, see <https://www.gnu.org/licenses/>. |
| 16 | */ |
| 17 | /** |
| 18 | * Word class and MorphoStream class definitions |
| 19 | * |
| 20 | * @author Felipe Sánchez-Martínez |
| 21 | */ |
| 22 | |
| 23 | #include <apertium/file_morpho_stream.h> |
| 24 | #include <lttoolbox/string_utils.h> |
| 25 | #include "apertium_config.h" |
| 26 | #include <apertium/unlocked_cstdio.h> |
| 27 | |
| 28 | FileMorphoStream::FileMorphoStream(const char* ftxt, bool d, TaggerData *t) : |
| 29 | ms() { |
| 30 | foundEOF = false; |
| 31 | debug=d; |
| 32 | td = t; |
| 33 | me = td->getPatternList().newMatchExe(); |
| 34 | alphabet = td->getPatternList().getAlphabet(); |
| 35 | input.open(ftxt); |
| 36 | ca_any_char = alphabet(PatternList::ANY_CHAR); |
| 37 | ca_any_tag = alphabet(PatternList::ANY_TAG); |
| 38 | |
| 39 | ConstantManager &constants = td->getConstants(); |
| 40 | ca_kignorar = constants.getConstant("kIGNORAR"_u); |
| 41 | ca_kbarra = constants.getConstant("kBARRA"_u); |
| 42 | ca_kdollar = constants.getConstant("kDOLLAR"_u); |
| 43 | ca_kbegin = constants.getConstant("kBEGIN"_u); |
| 44 | ca_kmot = constants.getConstant("kMOT"_u); |
| 45 | ca_kmas = constants.getConstant("kMAS"_u); |
| 46 | ca_kunknown = constants.getConstant("kUNKNOWN"_u); |
| 47 | |
| 48 | map<UString, int> &tag_index = td->getTagIndex(); |
| 49 | ca_tag_keof = tag_index["TAG_kEOF"_u]; |
| 50 | ca_tag_kundef = tag_index["TAG_kUNDEF"_u]; |
| 51 | |
| 52 | end_of_file = false; |
| 53 | null_flush = false; |
| 54 | } |
| 55 | |
| 56 | FileMorphoStream::~FileMorphoStream() |
| 57 | { |
| 58 | delete me; |
| 59 | } |
| 60 | |
| 61 | TaggerWord * |
| 62 | FileMorphoStream::get_next_word() |
| 63 | { |
| 64 | if(vwords.size() != 0) |
| 65 | { |
| 66 | TaggerWord* word=vwords.front(); |
| 67 | vwords.erase(vwords.begin()); |
| 68 | |
| 69 | if(word->isAmbiguous()) |
| 70 | { |
| 71 | vector<UString> &ref = td->getDiscardRules(); |
| 72 | for(unsigned int i = 0; i < ref.size(); i++) |
| 73 | { |
| 74 | word->discardOnAmbiguity(ref[i]); |
| 75 | } |
| 76 | } |
| 77 | // cout << *word << endl; |
| 78 | return word; |
| 79 | } |
| 80 | |
| 81 | if(input.eof()) |
| 82 | { |
| 83 | return NULL__null; |
| 84 | } |
| 85 | |
| 86 | int ivwords = 0; |
| 87 | vwords.push_back(new TaggerWord()); |
| 88 | |
| 89 | while(true) |
| 90 | { |
| 91 | UChar32 symbol = input.get(); |
| 92 | if(input.eof() || (null_flush && symbol == '\0')) |
| 93 | { |
| 94 | end_of_file = true; |
| 95 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
| 96 | return get_next_word(); |
| 97 | } |
| 98 | if(symbol == '^') |
| 99 | { |
| 100 | readRestOfWord(ivwords); |
| 101 | return get_next_word(); |
| 102 | } |
| 103 | else |
| 104 | { |
| 105 | UString str = ""_u; |
| 106 | if(symbol == '\\') |
| 107 | { |
| 108 | symbol = input.get(); |
| 109 | str += '\\'; |
| 110 | str += symbol; |
| 111 | symbol = '\\'; |
| 112 | } |
| 113 | else |
| 114 | { |
| 115 | str += symbol; |
| 116 | } |
| 117 | |
| 118 | while(symbol != '^') |
| 119 | { |
| 120 | symbol = input.get(); |
| 121 | if(input.eof() || (null_flush && symbol == '\0')) { |
| 122 | end_of_file = true; |
| 123 | vwords[ivwords]->add_ignored_string(str); |
| 124 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
| 125 | return get_next_word(); |
| 126 | } else if(symbol == '\\') { |
| 127 | str += '\\'; |
| 128 | symbol = input.get(); |
| 129 | if(input.eof() || (null_flush && symbol == '\0')) { |
| 130 | end_of_file = true; |
| 131 | vwords[ivwords]->add_ignored_string(str); |
| 132 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
| 133 | return get_next_word(); |
| 134 | } |
| 135 | str += symbol; |
| 136 | symbol = '\\'; |
| 137 | } else if(symbol == '^') { |
| 138 | if(str.size() > 0) { |
| 139 | vwords[ivwords]->add_ignored_string(str); |
| 140 | } |
| 141 | readRestOfWord(ivwords); |
| 142 | return get_next_word(); |
| 143 | } else { |
| 144 | str += symbol; |
| 145 | } |
| 146 | } |
| 147 | } |
| 148 | } |
| 149 | } |
| 150 | |
| 151 | void |
| 152 | FileMorphoStream::lrlmClassify(UString const &str, int &ivwords) |
| 153 | { |
| 154 | int floor = 0; |
| 155 | int last_type = -1; |
| 156 | int last_pos = 0; |
| 157 | int initial_iv = ivwords; |
| 158 | |
| 159 | ms.init(me->getInitial()); |
| 160 | for(int i = 0, limit = str.size(); i != limit; i++) |
| 161 | { |
| 162 | if(str[i] != '<') |
| 163 | { |
| 164 | if(str[i] == '+') |
| 165 | { |
| 166 | int val = ms.classifyFinals(me->getFinals()); |
| 167 | if(val != -1) |
| 168 | { |
| 169 | last_pos = i-1; |
| 170 | last_type = val; |
| 171 | } |
| 172 | } |
| 173 | ms.step(u_toloweru_tolower_72(str[i]), ca_any_char); |
| 174 | } |
| 175 | else |
| 176 | { |
| 177 | UString tag; |
| 178 | for(int j = i+1; j != limit; j++) |
| 179 | { |
| 180 | if(str[j] == '\\') |
| 181 | { |
| 182 | j++; |
| 183 | } |
| 184 | else if(str[j] == '>') |
| 185 | { |
| 186 | tag = str.substr(i, j-i+1); |
| 187 | i = j; |
| 188 | break; |
| 189 | } |
| 190 | } |
| 191 | |
| 192 | int symbol = alphabet(tag); |
| 193 | if(symbol) |
| 194 | { |
| 195 | ms.step(symbol, ca_any_tag); |
| 196 | } |
| 197 | else |
| 198 | { |
| 199 | ms.step(ca_any_tag); |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | if(ms.size() == 0) |
| 204 | { |
| 205 | if(last_pos != floor) |
| 206 | { |
| 207 | vwords[ivwords]->add_tag(last_type, |
| 208 | str.substr(floor, last_pos - floor + 1), |
| 209 | td->getPreferRules()); |
| 210 | if(str[last_pos+1] == '+' && last_pos+1 < limit ) |
| 211 | { |
| 212 | floor = last_pos + 1; |
| 213 | last_pos = floor + 1; |
| 214 | vwords[ivwords]->set_plus_cut(true); |
| 215 | if (((int)vwords.size())<=((int)(ivwords+1))) |
| 216 | vwords.push_back(new TaggerWord(true)); |
| 217 | ivwords++; |
| 218 | ms.init(me->getInitial()); |
| 219 | } |
| 220 | i = floor++; |
| 221 | } |
| 222 | else |
| 223 | { |
| 224 | if (debug) |
| 225 | { |
| 226 | cerr<<"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<"' of '" << str << "'\n"; |
| 227 | cerr<<" This is because of an incomplete tagset definition or a dictionary error\n"; |
| 228 | } |
| 229 | vwords[ivwords]->add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); |
| 230 | return; |
| 231 | } |
| 232 | } |
| 233 | else if(i == limit - 1) |
| 234 | { |
| 235 | if(ms.classifyFinals(me->getFinals()) == -1) |
| 236 | { |
| 237 | if(last_pos != floor) |
| 238 | { |
| 239 | vwords[ivwords]->add_tag(last_type, |
| 240 | str.substr(floor, last_pos - floor + 1), |
| 241 | td->getPreferRules()); |
| 242 | if(str[last_pos+1] == '+' && last_pos+1 < limit ) |
| 243 | { |
| 244 | floor = last_pos + 1; |
| 245 | last_pos = floor; |
| 246 | vwords[ivwords]->set_plus_cut(true); |
| 247 | if (((int)vwords.size())<=((int)(ivwords+1))) |
| 248 | vwords.push_back(new TaggerWord(true)); |
| 249 | ivwords++; |
| 250 | ms.init(me->getInitial()); |
| 251 | } |
| 252 | i = floor++; |
| 253 | } |
| 254 | else |
| 255 | { |
| 256 | if (debug) |
| 257 | { |
| 258 | cerr<<"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<"' of '" << str << "'\n"; |
| 259 | cerr<<" This is because of an incomplete tagset definition or a dictionary error\n"; |
| 260 | } |
| 261 | vwords[ivwords]->add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); |
| 262 | return; |
| 263 | } |
| 264 | } |
| 265 | } |
| 266 | } |
| 267 | |
| 268 | int val = ms.classifyFinals(me->getFinals()); |
| 269 | if(val == -1) |
| 270 | { |
| 271 | val = ca_tag_kundef; |
| 272 | if (debug) |
| 273 | { |
| 274 | cerr<<"Warning: There is not coarse tag for the fine tag '"<< str.substr(floor) <<"' of '" << str << "'\n"; |
| 275 | cerr<<" This is because of an incomplete tagset definition or a dictionary error\n"; |
| 276 | } |
| 277 | if(ivwords > initial_iv) { |
| 278 | // We've partially added a multiword -- undo the previous add to avoid outputting a partial (chopped off) lexical form: |
| 279 | while(ivwords > initial_iv) { |
| 280 | delete vwords[ivwords]; |
| 281 | vwords.pop_back(); |
| 282 | ivwords--; |
| 283 | } |
| 284 | vwords[ivwords]->set_plus_cut(false); |
| 285 | vwords[ivwords]->erase_tag(last_type); |
| 286 | vwords[ivwords]->add_tag(last_type, str, td->getPreferRules()); |
| 287 | return; |
| 288 | } |
| 289 | } |
| 290 | vwords[ivwords]->add_tag(val, str.substr(floor), td->getPreferRules()); |
| 291 | } |
| 292 | |
| 293 | void |
| 294 | FileMorphoStream::readRestOfWord(int &ivwords) |
| 295 | { |
| 296 | // first we have the superficial form |
| 297 | UString str; |
| 298 | |
| 299 | while(true) |
| 300 | { |
| 301 | UChar32 symbol = input.get(); |
| 302 | if(input.eof() || (null_flush && symbol == '\0')) |
| 303 | { |
| 304 | end_of_file = true; |
| 305 | if(str.size() > 0) |
| 306 | { |
| 307 | vwords[ivwords]->add_ignored_string(str); |
| 308 | cerr<<"Warning (internal): kIGNORE was returned while reading a word\n"; |
| 309 | cerr<<"Word being read: "<<vwords[ivwords]->get_superficial_form()<<"\n"; |
| 310 | cerr<<"Debug: "<< str <<"\n"; |
| 311 | } |
| 312 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
| 313 | return; |
| 314 | } |
| 315 | else if(symbol == '\\') |
| 316 | { |
| 317 | symbol = input.get(); |
| 318 | str += '\\'; |
| 319 | str += symbol; |
| 320 | } |
| 321 | else if(symbol == '/') |
| 322 | { |
| 323 | vwords[ivwords]->set_superficial_form(str); |
| 324 | str.clear(); |
| 325 | break; |
| 326 | } |
| 327 | else if(symbol == '$') |
| 328 | { |
| 329 | vwords[ivwords]->set_superficial_form(str); |
| 330 | vwords[ivwords]->add_ignored_string("$"_u); |
| 331 | break; |
| 332 | } |
| 333 | else |
| 334 | { |
| 335 | str += symbol; |
| 336 | } |
| 337 | } |
| 338 | |
| 339 | // then we read the acceptions |
| 340 | |
| 341 | while(true) |
| 342 | { |
| 343 | UChar32 symbol = input.get(); |
| 344 | if(input.eof() || (null_flush && symbol == '\0')) |
| 345 | { |
| 346 | end_of_file = true; |
| 347 | if(str.size() > 0) |
| 348 | { |
| 349 | vwords[ivwords]->add_ignored_string(str); |
| 350 | cerr<<"Warning (internal): kIGNORE was returned while reading a word\n"; |
| 351 | cerr<<"Word being read: "<<vwords[ivwords]->get_superficial_form()<<"\n"; |
| 352 | cerr<<"Debug: "<< str <<"\n"; |
| 353 | } |
| 354 | vwords[ivwords]->add_tag(ca_tag_keof, ""_u, td->getPreferRules()); |
| 355 | return; |
| 356 | } |
| 357 | else if(symbol == '\\') |
| 358 | { |
| 359 | symbol = input.get(); |
| 360 | str += '\\'; |
| 361 | str += symbol; |
| 362 | symbol = '\\'; // to prevent exiting with '\$' |
Value stored to 'symbol' is never read | |
| 363 | } |
| 364 | else if(symbol == '/') |
| 365 | { |
| 366 | lrlmClassify(str, ivwords); |
| 367 | str.clear(); |
| 368 | ivwords = 0; |
| 369 | continue; |
| 370 | } |
| 371 | else if(symbol == '$') |
| 372 | { |
| 373 | if(str[0] != '*')// do nothing with unknown words |
| 374 | { |
| 375 | lrlmClassify(str, ivwords); |
| 376 | } |
| 377 | return; |
| 378 | } |
| 379 | else |
| 380 | { |
| 381 | str += symbol; |
| 382 | } |
| 383 | } |
| 384 | } |
| 385 | |
| 386 | void |
| 387 | FileMorphoStream::setNullFlush(bool nf) |
| 388 | { |
| 389 | null_flush = nf; |
| 390 | } |
| 391 | |
| 392 | bool |
| 393 | FileMorphoStream::getEndOfFile(void) |
| 394 | { |
| 395 | return end_of_file; |
| 396 | } |
| 397 | |
| 398 | void |
| 399 | FileMorphoStream::setEndOfFile(bool eof) |
| 400 | { |
| 401 | end_of_file = eof; |
| 402 | } |
| 403 | |
| 404 | void |
| 405 | FileMorphoStream::rewind() |
| 406 | { |
| 407 | input.rewind(); |
| 408 | end_of_file = false; |
| 409 | } |