commit 11c56c83427b8401dc1de0f85f972d69355362c7 Author: Daniel Swanson Date: Tue Jul 6 14:23:15 2021 -0500 rewrite stream.h/cc to use InputFile also drop tag.h/cc since it was barely more than a typedef anyway diff --git a/apertium/Makefile.am b/apertium/Makefile.am index b731e60..5327ef2 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -39,7 +39,6 @@ h_sources = a.h \ streamed_type.h \ string_to_wostream.h \ shell_utils.h \ - tag.h \ tagger.h \ tagger_data.h \ tagger_data_hmm.h \ @@ -109,7 +108,6 @@ cc_sources = a.cc \ stream.cc \ stream_tagger.cc \ shell_utils.cc \ - tag.cc \ tagger.cc \ tagger_data.cc \ tagger_data_hmm.cc \ diff --git a/apertium/a.h b/apertium/a.h index bd60712..3140f95 100644 --- a/apertium/a.h +++ b/apertium/a.h @@ -18,7 +18,6 @@ #include "analysis.h" #include "morpheme.h" -#include "tag.h" #include @@ -29,7 +28,7 @@ public: friend bool operator<(const a &a_, const a &b_); a(); a(const Analysis &Analysis_); - std::vector TheTags; + std::vector TheTags; std::vector TheMorphemes; }; } diff --git a/apertium/analysis.cc b/apertium/analysis.cc index b3394ba..48dfcd3 100644 --- a/apertium/analysis.cc +++ b/apertium/analysis.cc @@ -51,4 +51,19 @@ Analysis::operator UString() const { return UString_; } + +void +Analysis::read(InputFile& in) +{ + UChar32 c; + do { + TheMorphemes.push_back(Morpheme()); + TheMorphemes.back().read(in); + c = in.get(); + } while (c == '+'); + if (in.eof() || c == '\0') { + throw Exception::Stream::UnexpectedEndOfFile("Unterminated lexical unit"); + } + in.unget(c); // leave $ or / for caller +} } diff --git a/apertium/analysis.h b/apertium/analysis.h index 194bc60..f0739d9 100644 --- a/apertium/analysis.h +++ b/apertium/analysis.h @@ -22,6 +22,7 @@ #include #include #include +#include namespace Apertium { class Analysis { @@ -31,6 +32,7 @@ public: friend bool operator==(const Analysis &a, const Analysis &b); friend bool operator<(const Analysis &a, const Analysis &b); operator UString() const; + void read(InputFile& in); std::vector TheMorphemes; }; } diff --git a/apertium/apertium_perceptron_trace.cc b/apertium/apertium_perceptron_trace.cc index f006a93..52077f4 100644 --- a/apertium/apertium_perceptron_trace.cc +++ b/apertium/apertium_perceptron_trace.cc @@ -41,13 +41,8 @@ int perceptron_trace(int argc, char* argv[]) PerceptronTagger pt(flags); pt.read_spec(argv[2]); - std::ifstream untagged_stream; - try_open_fstream("UNTAGGED_CORPUS", argv[3], untagged_stream); - Stream untagged(flags, untagged_stream, argv[3]); - - std::ifstream tagged_stream; - try_open_fstream("TAGGED_CORPUS", argv[4], tagged_stream); - Stream tagged(flags, tagged_stream, argv[4]); + Stream untagged(flags, argv[3]); + Stream tagged(flags, argv[4]); TrainingCorpus tc(tagged, untagged, false, false); diff --git a/apertium/deserialiser.h b/apertium/deserialiser.h index 2f90ea2..e0a4136 100644 --- a/apertium/deserialiser.h +++ b/apertium/deserialiser.h @@ -21,7 +21,6 @@ #include "i.h" #include "lemma.h" #include "morpheme.h" -#include "tag.h" #include "apertium_config.h" #include @@ -62,14 +61,9 @@ public: inline static Morpheme deserialise(std::istream &Stream_); }; -template <> class Deserialiser { -public: - inline static Tag deserialise(std::istream &Stream_); -}; - a Deserialiser::deserialise(std::istream &Stream_) { a StreamedType_; - StreamedType_.TheTags = Deserialiser >::deserialise(Stream_); + StreamedType_.TheTags = Deserialiser >::deserialise(Stream_); StreamedType_.TheMorphemes = Deserialiser >::deserialise(Stream_); return StreamedType_; @@ -84,7 +78,7 @@ Analysis Deserialiser::deserialise(std::istream &Stream_) { i Deserialiser::deserialise(std::istream &Stream_) { i StreamedType_; - StreamedType_.TheTags = Deserialiser >::deserialise(Stream_); + StreamedType_.TheTags = Deserialiser >::deserialise(Stream_); return StreamedType_; } @@ -98,13 +92,7 @@ Morpheme Deserialiser::deserialise(std::istream &Stream_) { Morpheme SerialisedType_; SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_); SerialisedType_.TheTags = - Deserialiser >::deserialise(Stream_); - return SerialisedType_; -} - -Tag Deserialiser::deserialise(std::istream &Stream_) { - Tag SerialisedType_; - SerialisedType_.TheTag = Deserialiser::deserialise(Stream_); + Deserialiser >::deserialise(Stream_); return SerialisedType_; } diff --git a/apertium/i.h b/apertium/i.h index 40a3bfd..0c895c9 100644 --- a/apertium/i.h +++ b/apertium/i.h @@ -18,7 +18,6 @@ #include "analysis.h" #include "morpheme.h" -#include "tag.h" #include @@ -31,7 +30,7 @@ public: i(); i(const Analysis &Analysis_); i(const Morpheme &Morpheme_); - std::vector TheTags; + std::vector TheTags; }; } diff --git a/apertium/lexical_unit.h b/apertium/lexical_unit.h index 7599caa..e7777f3 100644 --- a/apertium/lexical_unit.h +++ b/apertium/lexical_unit.h @@ -17,8 +17,6 @@ #define TAGGING_EXPRESSION_H #include "analysis.h" - -#include #include namespace Apertium { diff --git a/apertium/morpheme.cc b/apertium/morpheme.cc index 2de0d8b..5db5657 100644 --- a/apertium/morpheme.cc +++ b/apertium/morpheme.cc @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const Morpheme &morph) { ::operator<<(out, morph.TheLemma); for (auto& it : morph.TheTags) { out << '<'; - ::operator<<(out, it.TheTag); + ::operator<<(out, it); out << '>'; } // namespace issue @@ -54,9 +54,57 @@ Morpheme::operator UString() const { UString ustring_ = TheLemma; for (auto& Tag_ : TheTags) { - ustring_ += static_cast(Tag_); + ustring_ += '<'; + ustring_ += Tag_; + ustring_ += '>'; } return ustring_; } + +void +Morpheme::read(InputFile& in) +{ + UChar32 c = in.get(); + while (c != '<' && c != '$' && c != '/' && c != '\0' && c != '+') { + TheLemma += c; + if (c == '\\') { + if (in.eof() || in.peek() == '\0') { + throw Exception::Stream::UnexpectedEndOfFile("Unterminted lexical unit"); + } + TheLemma += in.get(); + } + c = in.get(); + } + if (TheLemma.empty()) { + throw Exception::Morpheme::TheLemma_empty("empty lemma"); + } + while (c == '<') { + UString tg = in.readBlock('<', '>'); + if (tg.size() == 2) { + throw Exception::Morpheme::TheTags_empty("invalid tag <>"); + } + TheTags.push_back(tg.substr(1, tg.size()-2)); + c = in.get(); + } + if (TheTags.empty()) { + throw Exception::Morpheme::TheTags_empty("morpheme has no tags"); + } + if (c == '#') { + while (c != '<' && c != '$' && c != '/' && c != '\0' && c != '+') { + TheLemma += c; + if (c == '\\') { + if (in.eof() || in.peek() == '\0') { + throw Exception::Stream::UnexpectedEndOfFile("trailing backslash"); + } + TheLemma += in.get(); + } + c = in.get(); + } + if (c == '<') { + throw Exception::Stream::UnexpectedCharacter("unexpected < after lemma queue"); + } + } + in.unget(c); +} } diff --git a/apertium/morpheme.h b/apertium/morpheme.h index eb2c3d2..22bdc13 100644 --- a/apertium/morpheme.h +++ b/apertium/morpheme.h @@ -16,9 +16,8 @@ #ifndef MORPHEME_H #define MORPHEME_H -#include "tag.h" - -#include +#include +#include #include #include @@ -29,8 +28,9 @@ public: friend bool operator<(const Morpheme &a, const Morpheme &b); friend std::ostream& operator<<(std::ostream& out, const Morpheme &morph); operator UString() const; + void read(InputFile& in); UString TheLemma; - std::vector TheTags; + std::vector TheTags; }; } diff --git a/apertium/perceptron_spec.cc b/apertium/perceptron_spec.cc index c785899..0c0161f 100644 --- a/apertium/perceptron_spec.cc +++ b/apertium/perceptron_spec.cc @@ -61,9 +61,7 @@ static Morpheme make_sentinel_wordoid( const UString &tag_str) { Morpheme morpheme; morpheme.TheLemma = lemma_str; - Tag tag; - tag.TheTag = tag_str; - morpheme.TheTags.push_back(tag); + morpheme.TheTags.push_back(tag_str); return morpheme; } @@ -520,13 +518,7 @@ PerceptronSpec::Machine::execCommonOp(Opcode op) stack.push(ambgset); } break; case EXTAGS: { - const std::vector &tags = stack.top().wrd().TheTags; - /*std::vector::const_iterator it = tags.begin(); - std::cerr << "tags: "; - for (;it != tags.end(); it++) { - std::cerr << &(*it) << " " << it->TheTag << ", "; - } - std::cerr << "\n";*/ + const std::vector &tags = stack.top().wrd().TheTags; std::vector *tags_str = new std::vector; tags_str->resize(tags.size()); transform(tags.begin(), tags.end(), tags_str->begin(), get_tag); @@ -770,9 +762,9 @@ void PerceptronSpec::appendStr(UnaryFeatureVec::iterator begin, } std::string -PerceptronSpec::Machine::get_tag(const Tag &in) { +PerceptronSpec::Machine::get_tag(const UString &in) { std::string result; - utf8::utf16to8(in.TheTag.begin(), in.TheTag.end(), std::back_inserter(result)); + utf8::utf16to8(in.begin(), in.end(), std::back_inserter(result)); return result; } diff --git a/apertium/perceptron_spec.h b/apertium/perceptron_spec.h index d51a507..5b8e986 100644 --- a/apertium/perceptron_spec.h +++ b/apertium/perceptron_spec.h @@ -267,7 +267,7 @@ public: } StackValue(const Morpheme &wordoid) { /*std::cerr << "Before "; - std::vector::const_iterator it = wordoid.TheTags.begin(); + std::vector::const_iterator it = wordoid.TheTags.begin(); for (;it != wordoid.TheTags.end(); it++) { std::cerr << &(*it) << " "; } @@ -459,7 +459,7 @@ private: int get_int_operand(); unsigned int get_uint_operand(); const std::string& get_str_operand(); - static std::string get_tag(const Tag &in); + static std::string get_tag(const UString &in); bool execCommonOp(Opcode op); public: void traceMachineState(); diff --git a/apertium/sentence_stream.cc b/apertium/sentence_stream.cc index a90e56e..4b5e16b 100644 --- a/apertium/sentence_stream.cc +++ b/apertium/sentence_stream.cc @@ -16,15 +16,11 @@ bool isSentenceEnd(StreamedType &token) { if (morphemes.size() != 1) { return false; } - std::vector &tags = morphemes.begin()->TheTags; + std::vector &tags = morphemes.begin()->TheTags; if (tags.size() != 1) { return false; } - Tag &tag = *tags.begin(); - if (tag.TheTag != "sent"_u) { - return false; - } - return true; + return (*tags.begin() == "sent"_u); } bool isSentenceEnd(StreamedType tok, Stream &in, bool sent_seg) { diff --git a/apertium/serialiser.h b/apertium/serialiser.h index 03a939f..4c586b6 100644 --- a/apertium/serialiser.h +++ b/apertium/serialiser.h @@ -21,7 +21,6 @@ #include "i.h" #include "lemma.h" #include "morpheme.h" -#include "tag.h" #include "apertium_config.h" #include @@ -66,12 +65,6 @@ public: std::ostream &Output); }; -template <> class Serialiser { -public: - inline static void serialise(const Tag &SerialisedType_, - std::ostream &Output); -}; - } void Serialiser::serialise(const a &SerialisedType_, std::ostream &Output) { @@ -99,11 +92,6 @@ void Serialiser::serialise(const Morpheme &SerialisedType_, ::serialise(SerialisedType_.TheTags, Output); } -void Serialiser::serialise(const Tag &SerialisedType_, - std::ostream &Output) { - ::serialise(SerialisedType_.TheTag, Output); -} - // [1] operator+ promotes its operand to a printable integral type. #endif // SERIALISER_H diff --git a/apertium/stream.cc b/apertium/stream.cc index 3d8897b..6a2b50f 100644 --- a/apertium/stream.cc +++ b/apertium/stream.cc @@ -15,600 +15,71 @@ #include "stream.h" -#include "analysis.h" #include "exception.h" -#include -#include -#include -#include - namespace Apertium { Stream::Stream(TaggerFlags &Flags_) - : TheLineNumber(1), TheCharacterStream(std::cin), TheFilename(), TheLine(), - TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} - -Stream::Stream(TaggerFlags &Flags_, - std::ifstream &CharacterStream_, const char *const Filename_) - : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_), - TheLine(), TheFlags(Flags_), private_flush_(false), - ThePreviousCase() {} - -Stream::Stream(TaggerFlags &Flags_, - std::ifstream &CharacterStream_, const std::string &Filename_) - : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_), - TheLine(), TheFlags(Flags_), private_flush_(false), - ThePreviousCase() {} + : TheFlags(Flags_) {} -Stream::Stream(TaggerFlags &Flags_, - std::ifstream &CharacterStream_, - const std::stringstream &Filename_) - : TheLineNumber(1), TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()), - TheLine(), TheFlags(Flags_), private_flush_(false), - ThePreviousCase() {} +Stream::Stream(TaggerFlags &Flags_, const char *const Filename_) + : TheFlags(Flags_) +{ + TheCharacterStream.open_or_exit(Filename_); +} StreamedType Stream::get() { StreamedType TheStreamedType; - UString Lemma; private_flush_ = false; - //TheCharacterStream.clear(); - if (!is_eof_throw_if_not_TheCharacterStream_good()) { - while (true) { - const UChar Character_ = TheCharacterStream.get(); - - if (is_eof_throw_if_not_TheCharacterStream_good(TheStreamedType, Lemma, - Character_)) - break; - - TheLine.push_back(Character_); - - switch (Character_) { - case '\\': // <\> 92, Hex 5c, Octal 134 - case_0x5c(TheStreamedType, Lemma, Character_); - continue; - case '[': - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - case ']': - case '$': - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '[' expected to follow '[', ']' or '$'"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - } - - push_back_Character(TheStreamedType, Lemma, Character_); - ThePreviousCase = PreviousCaseType(Character_); - continue; - case ']': - if (!ThePreviousCase) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "', ']' expected to follow '['"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - switch (ThePreviousCase->ThePreviousCase) { - case '[': - case ']': - push_back_Character(TheStreamedType, Lemma, Character_); - ThePreviousCase = PreviousCaseType(Character_); - continue; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', ']' expected to follow '[' or ']'"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - std::abort(); - case '^': - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case ']': - case '$': - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '^' expected to follow '[', ']', or '$'"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - } - - TheStreamedType.TheLexicalUnit = LexicalUnit(); - ThePreviousCase = PreviousCaseType(Character_); - continue; - case '/': - if (!ThePreviousCase) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "', '/' expected to follow '[', to follow '>' " - "immediately, or to follow '^' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - switch (ThePreviousCase->ThePreviousCase) { - case '[': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '^': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '/' expected to follow '[', to follow '>' " - "immediately, or to follow '^' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - ThePreviousCase = PreviousCaseType(Character_); - - { - const UChar Character_ = TheCharacterStream.get(); - - if (is_eof_throw_if_not_TheCharacterStream_good( - TheStreamedType, Lemma, Character_)) { - std::stringstream Message; - Message << "unexpected end-of-file following '" - << ThePreviousCase->ThePreviousCase - << "', end-of-file expected to follow ']' or '$'"; - throw Exception::Stream::UnexpectedEndOfFile( - Message_what(Message)); - } - - TheLine.push_back(Character_); - - switch (Character_) { - case '\\': - TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.push_back(Morpheme()); - case_0x5c(TheStreamedType, Lemma, Character_); - continue; - case '*': - ThePreviousCase = PreviousCaseType(Character_); - continue; - case '\n': { - std::stringstream Message; - Message << "unexpected newline following '" - << ThePreviousCase->ThePreviousCase - << "', newline expected to follow '[', ']', or '$'"; - throw Exception::Stream::UnexpectedCharacter( - Message_what(Message)); - }; - case '<': - TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.push_back(Morpheme()); - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheTags.push_back(Tag()); - ThePreviousCase = PreviousCaseType(Character_); - continue; - - case '[': - case ']': - case '^': - case '#': - case '>': - case '+': - case '$': { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase << "', expected '*'"; - throw Exception::Stream::UnexpectedPreviousCase( - Message_what(Message)); - } - default: - TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.push_back(Morpheme()); - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - } - } - - continue; - case '>': - if (!ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' not immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '/' expected to follow '[', to follow '>' " - "immediately, or to follow '^' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - case '#': - - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '/' expected to follow '[', to follow '>' " - "immediately, or to follow '^' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '/' expected to follow '[', to follow '>' " - "immediately, or to follow '^' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } + TheStreamedType.TheString = TheCharacterStream.readBlank(true); + if (!TheCharacterStream.eof() && TheCharacterStream.peek() == '^') { + TheCharacterStream.get(); + TheStreamedType.TheLexicalUnit = LexicalUnit(); + UChar32 c = TheCharacterStream.get(); + while (c != '/' && c != '$') { + TheStreamedType.TheLexicalUnit->TheSurfaceForm += c; + c = TheCharacterStream.get(); + } + if (c == '$') { + throw Exception::Analysis::TheMorphemes_empty("lexical unit has no analyses"); + } else if (TheStreamedType.TheLexicalUnit->TheSurfaceForm.empty()) { + throw Exception::Stream::UnexpectedCharacter("unexpected /, surface form is empty"); + } + c = TheCharacterStream.get(); + if (c == '$') { + throw Exception::Analysis::TheMorphemes_empty("lexical unit has no analyses"); + } else if (c == '*') { + TheCharacterStream.readBlock(c, '$'); + } else { + TheCharacterStream.unget(c); + do { TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.push_back(Morpheme()); - ThePreviousCase = PreviousCaseType(Character_); - continue; - case '*': - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - case ']': - case '$': - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '*' expected to follow '[', ']', or '$' or to " - "follow '/' immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - } - - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '<': - if (!ThePreviousCase) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "', '<' expected to follow '[', to follow '>' " - "immediately, or to follow '#', '/' or '+' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - switch (ThePreviousCase->ThePreviousCase) { - case '[': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '/': - break; - case '#': - //std::cerr << "[306] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; - case '+': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '<' expected to follow '[', '/', '>'" - "immediately, or to follow '#' or '+' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - case '>': - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '<' expected to follow '[', to follow '>' " - "immediately, or to follow '#', '/' or '+' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheTags.push_back(Tag()); - ThePreviousCase = PreviousCaseType(Character_); - continue; - case '>': - if (!ThePreviousCase) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "', '>' expected to follow '[' or to follow '<' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - switch (ThePreviousCase->ThePreviousCase) { - case '[': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '<': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '>' expected to follow '[' or to follow '<' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - ThePreviousCase = PreviousCaseType(Character_); - continue; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '>' expected to follow '[' or to follow '<' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - std::abort(); - case '#': - //std::cerr << "[391] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - case ']': - case '^': - case '$': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '/': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '#' expected to follow '[', ']', or '$', to " - "follow '>' immediately, or to follow '/' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - case '>': - if (!ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' not immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '#' expected to follow '[', ']', or '$', to " - "follow '>' immediately, or to follow '/' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '#' expected to follow '[', ']', or '$', to follow " - "'>' immediately, or to follow '/' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - ThePreviousCase = PreviousCaseType(Character_); - push_back_Character(TheStreamedType, Lemma, Character_); - //std::cerr << "[440] Character: " << Character_ << "||| Lemma: " << Lemma << std::endl ; - continue; - } - - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '+': - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - case ']': - case '^': - case '/': - case '$': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '>': - if (!ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' not immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '+' expected to follow '[', ']', '^', '/' or " - "'$', to follow '>' immediately, or to follow '#' " - "not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - case '#': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '+' expected to follow '[', ']', or '$', to " - "follow '>' immediately, or to follow '#' not " - "immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - default: { - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '+' expected to follow '[', ']', or '$', to follow " - "'>' immediately, or to follow '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - } - - TheStreamedType.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.push_back(Morpheme()); - ThePreviousCase = PreviousCaseType(Character_); - continue; - } - - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '$': - if (!ThePreviousCase) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "', '$' expected to follow '[', to follow '>' " - "immediately, or to follow '*' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - switch (ThePreviousCase->ThePreviousCase) { - case '[': - push_back_Character(TheStreamedType, Lemma, Character_); - continue; - case '*': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '$' expected to follow '[', to follow '>' " - "immediately, or to follow '*' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - if (TheFlags.getDebug()) { - if (Lemma != TheStreamedType.TheLexicalUnit->TheSurfaceForm) - std::cerr << "unexpected lemma \"" << Lemma - << "\", expected \"" - << TheStreamedType.TheLexicalUnit->TheSurfaceForm - << "\"\n"; - } - - ThePreviousCase = PreviousCaseType(Character_); - return TheStreamedType; - case '>': - if (!ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' not immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '$' expected to follow '[', to follow '>' " - "immediately, or to follow '*' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - case '#': - if (ThePreviousCase->isPreviousCharacter) { - std::stringstream Message; - Message << "unexpected '" << Character_ - << "' immediately following '" - << ThePreviousCase->ThePreviousCase - << "', '$' expected to follow '[', to follow '>' " - "immediately, or to follow '*' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - break; - default: - std::stringstream Message; - Message << "unexpected '" << Character_ << "' following '" - << ThePreviousCase->ThePreviousCase - << "', '$' expected to follow '[', to follow '>' " - "immediately, or to follow '*' or '#' not immediately"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - - ThePreviousCase = PreviousCaseType(Character_); - return TheStreamedType; - case '\n': - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - case ']': - case '$': - break; - default: - std::stringstream Message; - Message << "unexpected newline following '" - << ThePreviousCase->ThePreviousCase - << "', newline expected to follow '[', ']', or '$'"; - throw Exception::Stream::UnexpectedCase(Message_what(Message)); - } - } - - push_back_Character(TheStreamedType, Lemma, Character_); - ++TheLineNumber; - TheLine.clear(); - continue; - default: - push_back_Character(TheStreamedType, Lemma, Character_); - continue; + TheStreamedType.TheLexicalUnit->TheAnalyses.back().read(TheCharacterStream); + c = TheCharacterStream.get(); + } while (c == '/'); + if (c != '$') { + throw Exception::Stream::UnexpectedEndOfFile("unterminated lexical unit"); } - - std::abort(); } } - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case ']': - case '$': - break; - default: - std::stringstream Message; - Message << "unexpected end-of-file following '" - << ThePreviousCase->ThePreviousCase - << "', end-of-file expected to follow ']' or '$'"; - throw Exception::Stream::UnexpectedEndOfFile(Message_what(Message)); - } + if (TheCharacterStream.peek() == '\0') { + TheCharacterStream.get(); + private_flush_ = true; } return TheStreamedType; } -StreamedType Stream::peek() { - bool prev_flush = private_flush_; - std::ios::iostate state = TheCharacterStream.rdstate(); - int pos = TheCharacterStream.tellg(); - - StreamedType token = get(); - - TheCharacterStream.clear(state); - TheCharacterStream.seekg(pos); - private_flush_ = prev_flush; - return token; -} - bool Stream::peekIsBlank() { - std::ios::iostate state = TheCharacterStream.rdstate(); - int pos = TheCharacterStream.tellg(); - - const UChar newline1 = TheCharacterStream.get(); - const UChar newline2 = TheCharacterStream.get(); + const UChar32 newline1 = TheCharacterStream.get(); + const UChar32 newline2 = TheCharacterStream.get(); - TheCharacterStream.clear(state); - TheCharacterStream.seekg(pos); + // somewhat dangerous to unget twice + // but InputFile does have a 3 char buffer + TheCharacterStream.unget(newline2); + TheCharacterStream.unget(newline1); return newline1 == '\n' && newline2 == '\n'; } @@ -629,9 +100,8 @@ void Stream::outputLexicalUnit( return; } - if (flags.getMark()) { - if (lexical_unit.TheAnalyses.size() != 1) - output << "="; + if (flags.getMark() && lexical_unit.TheAnalyses.size() != 1) { + output << "="; } if (flags.getShowSuperficial()) @@ -640,162 +110,12 @@ void Stream::outputLexicalUnit( output << *analysis; if (flags.getFirst()) { - for (std::vector::const_iterator other_analysis = - lexical_unit.TheAnalyses.begin(); - // Call .end() each iteration to save memory. - other_analysis != lexical_unit.TheAnalyses.end(); ++other_analysis) { - if (*other_analysis != *analysis) - output << "/" << *other_analysis; + for (auto& other_analysis : lexical_unit.TheAnalyses) { + if (other_analysis != *analysis) + output << "/" << other_analysis; } } output << "$"; } - -Stream::PreviousCaseType::PreviousCaseType(const UChar &PreviousCase_) - : ThePreviousCase(PreviousCase_), isPreviousCharacter(true) {} - -bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const { - if (TheCharacterStream.eof()) - return true; - - if (!TheCharacterStream) { - std::cerr << "State bad " << TheCharacterStream.good() << " " - << TheCharacterStream.eof() << " " - << TheCharacterStream.fail() << " " - << TheCharacterStream.bad() << "\n"; - std::stringstream Message; - Message << "can't get const UChar: TheCharacterStream not good"; - throw Exception::Stream::TheCharacterStream_not_good( - Message_what(Message)); - } - - return false; -} - -UString Stream::Message_what(const std::stringstream &Message) const { - std::stringstream what_; - - if (TheFilename) - what_ << UString(TheFilename->begin(), TheFilename->end()) << ": "; - - what_ << TheLineNumber << ":" << TheLine.size() << ": " << Message.str() - << '\n' << TheLine << '\n' << UString(TheLine.size() - 1, ' ') - << '^'; - return to_ustring(what_.str().c_str()); -} - -bool -Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, - UString &Lemma, - const UChar &Character_) { - if (isTheCharacterStream_eof(StreamedType_, Lemma, Character_)) - return true; - - if (!TheCharacterStream) { - std::stringstream Message; - Message << "can't get const UChar: TheCharacterStream not good"; - throw Exception::Stream::TheCharacterStream_not_good( - Message_what(Message)); - } - - return false; -} - -bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, - UString &Lemma, - const UChar &Character_) { - if (TheCharacterStream.eof()) - return true; - - if (TheFlags.getNullFlush()) { - if (Character_ == '\0') { - push_back_Character(StreamedType_, Lemma, Character_); - private_flush_ = true; - return true; - } - } - - return false; -} - -void Stream::push_back_Character(StreamedType &StreamedType_, - UString &Lemma, - const UChar &Character_) { - if (ThePreviousCase) { - switch (ThePreviousCase->ThePreviousCase) { - case '[': - StreamedType_.TheString += Character_; - break; - case ']': - StreamedType_.TheString += Character_; - break; - case '^': - StreamedType_.TheLexicalUnit->TheSurfaceForm += Character_; - break; - case '/': - StreamedType_.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheLemma.push_back(Character_); - break; - case '*': - Lemma += Character_; - break; - case '<': - StreamedType_.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheTags.back() - .TheTag += Character_; - break; - case '>': - StreamedType_.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheLemma.push_back(Character_); - break; - case '#': - StreamedType_.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheLemma.push_back(Character_); - break; - case '+': - StreamedType_.TheLexicalUnit->TheAnalyses.back() - .TheMorphemes.back() - .TheLemma.push_back(Character_); - break; - case '$': - StreamedType_.TheString += Character_; - break; - default: - std::stringstream Message; - Message << "unexpected previous reserved or special character '" - << ThePreviousCase->ThePreviousCase << "'"; - throw Exception::Stream::UnexpectedPreviousCase(Message_what(Message)); - } - - ThePreviousCase->isPreviousCharacter = false; - return; - } - - StreamedType_.TheString += Character_; -} - -void Stream::case_0x5c(StreamedType &StreamedType_, UString &Lemma, - const UChar &Character_) { - push_back_Character(StreamedType_, Lemma, Character_); - - { - const UChar Character_ = TheCharacterStream.get(); - - if (is_eof_throw_if_not_TheCharacterStream_good(StreamedType_, Lemma, - Character_)) { - std::stringstream Message; - Message << "unexpected end-of-file following '\\', end-of-file " - "expected to follow ']' or '$'"; - throw Exception::Stream::UnexpectedEndOfFile(Message_what(Message)); - } - - TheLine.push_back(Character_); - push_back_Character(StreamedType_, Lemma, Character_); - } -} } diff --git a/apertium/stream.h b/apertium/stream.h index 69c266a..07c2a65 100644 --- a/apertium/stream.h +++ b/apertium/stream.h @@ -19,57 +19,29 @@ #include "tagger_flags.h" #include "optional.h" #include "streamed_type.h" +#include "analysis.h" + +#include #include -#include #include -#include -#include namespace Apertium { class Stream { public: - Stream(TaggerFlags &Flags_); - Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, - const char *const Filename_); - Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, - const std::string &Filename_); - Stream(TaggerFlags &Flags_, std::ifstream &CharacterStream_, - const std::stringstream &Filename_); + Stream(TaggerFlags& Flags_); + Stream(TaggerFlags &Flags_, const char *const Filename_); StreamedType get(); - StreamedType peek(); bool peekIsBlank(); bool flush_() const; static void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, std::ostream &output, TaggerFlags &flags); - - std::size_t TheLineNumber; private: - class PreviousCaseType { - public: - PreviousCaseType(const UChar &PreviousCase_); - UChar ThePreviousCase; - bool isPreviousCharacter : 1; - }; - bool is_eof_throw_if_not_TheCharacterStream_good() const; - UString Message_what(const std::stringstream &Message) const; - bool is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, - UString &Lemma, - const UChar &Character_); - bool isTheCharacterStream_eof(StreamedType &StreamedType_, - UString &Lemma, const UChar &Character_); - void push_back_Character(StreamedType &StreamedType_, UString &Lemma, - const UChar &Character_); - void case_0x5c(StreamedType &StreamedType_, UString &Lemma, - const UChar &Character_); - std::istream &TheCharacterStream; - Optional TheFilename; - UString TheLine; + bool private_flush_ = false; + InputFile TheCharacterStream; TaggerFlags &TheFlags; - bool private_flush_ : 1; - Optional ThePreviousCase; }; } diff --git a/apertium/tag.cc b/apertium/tag.cc deleted file mode 100644 index 2983f94..0000000 --- a/apertium/tag.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License as -// published by the Free Software Foundation; either version 2 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, see . - -#include "tag.h" - -#include "exception.h" - -#include - -namespace Apertium { -bool operator==(const Tag &a, const Tag &b) { return a.TheTag == b.TheTag; } - -bool operator<(const Tag &a, const Tag &b) { return a.TheTag < b.TheTag; } - -Tag::operator UString() const { - if (TheTag.empty()) - throw Exception::Tag::TheTags_empty("can't convert Tag comprising empty " - "TheTag UString to UString"); - - UString ret; - ret.reserve(TheTag.size() + 2); - ret += '<'; - ret.append(TheTag); - ret += '>'; - return ret; -} -} diff --git a/apertium/tag.h b/apertium/tag.h deleted file mode 100644 index 62698e8..0000000 --- a/apertium/tag.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante -// -// This program is free software; you can redistribute it and/or -// modify it under the terms of the GNU General Public License as -// published by the Free Software Foundation; either version 2 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, see . - -#ifndef TAG_H -#define TAG_H - -#include - -namespace Apertium { -class Tag { -public: - friend bool operator==(const Tag &a, const Tag &b); - friend bool operator<(const Tag &a, const Tag &b); - operator UString() const; - UString TheTag; -}; -} - -#endif // TAG_H diff --git a/apertium/tagger.cc b/apertium/tagger.cc index 6e8372c..f068586 100644 --- a/apertium/tagger.cc +++ b/apertium/tagger.cc @@ -577,25 +577,23 @@ void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { << "\" Reason: " << ExceptionType_.what(); throw Exception::apertium_tagger::deserialise(what_); } + if (nonoptarg < 2) { Stream Input(TheFlags); StreamTagger_.tag(Input, std::cout); return; } - std::ifstream Input_stream; - try_open_fstream("INPUT", argv[optind + 1], Input_stream); + Stream Input(TheFlags, argv[optind + 1]); if (nonoptarg < 3) { - Stream Input(TheFlags, Input_stream, argv[optind + 1]); StreamTagger_.tag(Input, std::cout); return; } std::ofstream Output_stream; - try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); + try_open_fstream("OUTPUT", argv[optind + 2], Output_stream); - Stream Input(TheFlags, Input_stream, argv[optind + 1]); StreamTagger_.tag(Input, Output_stream); } @@ -616,14 +614,10 @@ void apertium_tagger::s_StreamTaggerTrainer( expect_file_arguments(nonoptarg, 2); } - std::ifstream TaggedCorpus_stream; - try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); - Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]); + Stream TaggedCorpus(TheFlags, argv[optind + 1]); if (*TheFunctionTypeType == Perceptron) { - std::ifstream UntaggedCorpus_stream; - try_open_fstream("UNTAGGED_CORPUS", argv[optind + 2], UntaggedCorpus_stream); - Stream UntaggedCorpus(TheFlags, UntaggedCorpus_stream, argv[optind + 2]); + Stream UntaggedCorpus(TheFlags, argv[optind + 2]); PerceptronTagger &pt = dynamic_cast(StreamTaggerTrainer_); pt.read_spec(argv[optind + 3]); diff --git a/apertium/tagger_data_percep_coarse_tags.cc b/apertium/tagger_data_percep_coarse_tags.cc index 89ce084..95da0a6 100644 --- a/apertium/tagger_data_percep_coarse_tags.cc +++ b/apertium/tagger_data_percep_coarse_tags.cc @@ -63,7 +63,7 @@ const UString& TaggerDataPercepCoarseTags::coarsen(const Apertium::Morpheme &wrd for (size_t i = 0; i < wrd.TheTags.size(); i++) { UString tag; tag += '<'; - tag.append(wrd.TheTags[i].TheTag); + tag.append(wrd.TheTags[i]); tag += '>'; int symbol = alphabet(tag); if (symbol) {