Index: trunk/apertium/apertium/apertium_tagger.cc =================================================================== --- trunk/apertium/apertium/apertium_tagger.cc (revision 69612) +++ trunk/apertium/apertium/apertium_tagger.cc (revision 69619) @@ -22,12 +22,10 @@ #include "basic_stream_tagger.h" #include "basic_stream_tagger_trainer.h" #include "basic_tagger.h" +#include "err_exception.h" #include "exception.h" #include "file_tagger.h" -#include "err_exception.h" -#include #include "linebreak.h" -#include #include "stream_5_3_1_tagger.h" #include "stream_5_3_1_tagger_trainer.h" #include "stream_5_3_2_tagger.h" @@ -34,16 +32,18 @@ #include "stream_5_3_2_tagger_trainer.h" #include "stream_5_3_3_tagger.h" #include "stream_5_3_3_tagger_trainer.h" +#include +#include #include #include +#include "getopt_long.h" #include #include #include #include #include -#include "getopt_long.h" #include #include #include @@ -61,28 +61,20 @@ apertium_tagger::apertium_tagger(int &argc, char **&argv) : argc(argc), argv(argv), The_val(), - The_indexptr(), FunctionTypeTypeOption_indexptr(), FunctionTypeOption_indexptr(), - TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), TheFunctionTypeOptionArgument(0), TheFlags() { try { while (true) { - The_val = + The_val = getopt_long(argc, argv, "dfgmpr:s:t:u:wz", longopts, &The_indexptr); - - getopt_long(argc, argv, "dfgmpr:s:t:u:wz", longopts, &The_indexptr); - - if (The_val == -1) break; - set_indexptr(); - switch (The_val) { case 'd': flagOptionCase(&basic_Tagger::Flags::getDebug, @@ -125,10 +117,10 @@ { std::stringstream what_; what_ << "invalid argument '" << optarg << "' for '--unigram'\n" -"Valid arguments are:\n" -" - '1'\n" -" - '2'\n" -" - '3'"; + "Valid arguments are:\n" + " - '1'\n" + " - '2'\n" + " - '3'"; throw Exception::apertium_tagger::InvalidArgument(what_); } break; @@ -291,25 +283,25 @@ void apertium_tagger::help() { std::cerr << -"Usage: apertium-tagger [OPTION]... -g SERIALISED_BASIC_TAGGER \\\n" +"Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" " [INPUT \\\n" " [OUTPUT]]\n" "\n" " or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" " CORPUS \\\n" -" SERIALISED_BASIC_TAGGER\n" +" SERIALISED_TAGGER\n" "\n" " or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" " DICTIONARY \\\n" " CORPUS \\\n" " TAGGER_SPECIFICATION \\\n" -" SERIALISED_BASIC_TAGGER \\\n" +" SERIALISED_TAGGER \\\n" " TAGGED_CORPUS \\\n" " UNTAGGED_CORPUS\n" "\n" " or: apertium-tagger [OPTION]... -s 0 \\\n" " -u MODEL \\\n" -" SERIALISED_BASIC_TAGGER \\\n" +" SERIALISED_TAGGER \\\n" " TAGGED_CORPUS\n" "\n" " or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" @@ -316,7 +308,7 @@ " DICTIONARY \\\n" " CORPUS \\\n" " TAGGER_SPECIFICATION \\\n" -" SERIALISED_BASIC_TAGGER\n" +" SERIALISED_TAGGER\n" "\n" "\n" "Mandatory arguments to long options are mandatory for short options too.\n" @@ -353,7 +345,6 @@ align::align_(options_description_); } - std::string apertium_tagger::option_string(const int &indexptr_) { return option_string(longopts[indexptr_]); } @@ -364,7 +355,6 @@ return option_string_.str(); } - void apertium_tagger::locale_global_() { #if defined __clang__ @@ -384,7 +374,6 @@ #endif // defined __clang__ } - const struct option apertium_tagger::longopts[] = { {"help", no_argument, 0, 'h'}, {"debug", no_argument, 0, 'd'}, @@ -400,8 +389,6 @@ {"train", required_argument, 0, 't'}, {0, 0, 0, 0}}; - - void apertium_tagger::set_indexptr() { if (The_val == longopts[The_indexptr].val) return; @@ -415,7 +402,6 @@ } } - void apertium_tagger::flagOptionCase( bool (basic_Tagger::Flags::*GetFlag)() const, void (basic_Tagger::Flags::*SetFlag)(const bool &)) { @@ -430,66 +416,35 @@ } std::string apertium_tagger::option_string() { - - return option_string(The_indexptr); - } void apertium_tagger::functionTypeTypeOptionCase( const FunctionTypeType &FunctionTypeType_) { - if ( - - - FunctionTypeTypeOption_indexptr - - - ) { + if (FunctionTypeTypeOption_indexptr) { std::stringstream what_; what_ << "unexpected '" << option_string() << "' following '" - << option_string( - - - *FunctionTypeTypeOption_indexptr - - - ) << '\''; + << option_string(*FunctionTypeTypeOption_indexptr) + << '\''; throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); } TheFunctionTypeType = FunctionTypeType_; - - FunctionTypeTypeOption_indexptr = The_indexptr; - } -void -apertium_tagger::functionTypeOptionCase(const FunctionType &FunctionType_) { - if ( - - - FunctionTypeOption_indexptr - - - ) { +void apertium_tagger::functionTypeOptionCase( + const FunctionType &FunctionType_) { + if (FunctionTypeOption_indexptr) { std::stringstream what_; what_ << "unexpected '" << option_string() << "' following '" - << option_string( - - - *FunctionTypeOption_indexptr - - - ) << '\''; + << option_string(*FunctionTypeOption_indexptr) + << '\''; throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); } TheFunctionType = FunctionType_; - - FunctionTypeOption_indexptr = The_indexptr; - } void apertium_tagger::getIterationsArgument() { @@ -530,6 +485,45 @@ return N_0; } +template +static void try_open_fstream(const char *metavar, const char *filename, + T &stream) { + stream.open(filename); + if (stream.fail()) { + std::stringstream what_; + what_ << "can't open " << metavar << " file \"" << filename << "\""; + throw Exception::apertium_tagger::open_stream_fail(what_); + } +} + +static FILE *try_open_file(const char *metavar, const char *filename, + const char *flags) { + FILE *f = std::fopen(filename, flags); + if (f == NULL) { + std::stringstream what_; + what_ << "can't open " << metavar << " file \"" << filename << "\""; + throw Exception::apertium_tagger::fopen(what_); + } + return f; +} + +static inline FILE *try_open_file_utf8(const char *metavar, const char *filename, + const char *flags) { + FILE *f = try_open_file(metavar, filename, flags); +#ifdef _MSC_VER + _setmode(_fileno(f), _O_U8TEXT); +#endif // _MSC_VER + return f; +} + +static void try_close_file(const char *metavar, const char *filename, FILE *file) { + if (std::fclose(file) != 0) { + std::stringstream what_; + what_ << "can't close " << metavar << " file \"" << filename << "\""; + throw Exception::apertium_tagger::fclose(what_); + } +} + void apertium_tagger::g_StreamTagger(basic_StreamTagger &StreamTagger_) { locale_global_(); @@ -539,20 +533,15 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - std::ifstream SerialisedAnalysisFrequencies(argv[optind]); + std::ifstream SerialisedAnalysisFrequencies; + try_open_fstream("SERIALISED_TAGGER", argv[optind], + SerialisedAnalysisFrequencies); - if (SerialisedAnalysisFrequencies.fail()) { - std::stringstream what_; - what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind] - << "\""; - throw Exception::apertium_tagger::ifstream_fail(what_); - } - try { StreamTagger_.deserialise(SerialisedAnalysisFrequencies); } catch (const basic_ExceptionType &basic_ExceptionType_) { std::stringstream what_; - what_ << "can't deserialise SERIALISED_BASIC_TAGGER file \"" << argv[optind] + what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] << "\" Reason: " << basic_ExceptionType_.what(); throw Exception::apertium_tagger::deserialise(what_); } @@ -563,14 +552,9 @@ return; } - std::wifstream Input_stream(argv[optind + 1]); + std::wifstream Input_stream; + try_open_fstream("INPUT", argv[optind + 1], Input_stream); - if (Input_stream.fail()) { - std::stringstream what_; - what_ << "can't open INPUT file \"" << argv[optind + 1] << "\""; - throw Exception::apertium_tagger::wifstream_fail(what_); - } - if (argc - optind < 3) { Stream Input(TheFlags, Input_stream, argv[optind + 1]); StreamTagger_.tag(Input, std::wcout); @@ -577,14 +561,9 @@ return; } - std::wofstream Output_stream(argv[optind + 2]); + std::wofstream Output_stream; + try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); - if (Output_stream.fail()) { - std::stringstream what_; - what_ << "can't open OUTPUT file \"" << argv[optind + 2] << "\""; - throw Exception::apertium_tagger::wofstream_fail(what_); - } - Stream Input(TheFlags, Input_stream, argv[optind + 1]); StreamTagger_.tag(Input, Output_stream); } @@ -606,26 +585,16 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - std::wifstream TaggedCorpus_stream(argv[optind + 1]); + std::wifstream TaggedCorpus_stream; + try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); - if (TaggedCorpus_stream.fail()) { - std::stringstream what_; - what_ << "can't open TAGGED_CORPUS file \"" << argv[optind + 1] << "\""; - throw Exception::apertium_tagger::wifstream_fail(what_); - } - Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind]); StreamTaggerTrainer_.train(TaggedCorpus); - std::ofstream Serialised_basic_Tagger(argv[optind]); + std::ofstream Serialised_basic_Tagger; + try_open_fstream("SERIALISED_TAGGER", argv[optind], + Serialised_basic_Tagger); - if (Serialised_basic_Tagger.fail()) { - std::stringstream what_; - what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind] - << "\""; - throw Exception::apertium_tagger::ofstream_fail(what_); - } - StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); } @@ -638,24 +607,11 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - FILE *Serialised_FILE_Tagger = std::fopen(argv[optind], "rb"); - - if (Serialised_FILE_Tagger == NULL) { - std::stringstream what_; - what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind] - << "\" for reading in binary mode"; - throw Exception::apertium_tagger::fopen(what_); - } - + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); FILE_Tagger_.deserialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); - if (std::fclose(Serialised_FILE_Tagger) != 0) { - std::stringstream what_; - what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind] - << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); TaggerWord::generate_marks = TheFlags.getMark(); @@ -665,54 +621,18 @@ if (argc - optind < 2) FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst()); else { - FILE *Input = std::fopen(argv[optind + 1], "r"); + FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); - if (Input == NULL) { - std::stringstream what_; - what_ << "can't open INPUT file \"" << argv[optind + 1] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - -#ifdef _MSC_VER - - _setmode(_fileno(Input), _O_U8TEXT); - -#endif // _MSC_VER - if (argc - optind < 3) FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst()); else { - FILE *Output = std::fopen(argv[optind + 2], "w"); - - if (Output == NULL) { - std::stringstream what_; - what_ << "can't open OUTPUT file \"" << argv[optind + 2] - << "\" for writing"; - throw Exception::apertium_tagger::fopen(what_); - } - -#ifdef _MSC_VER - - _setmode(_fileno(Output), _O_U8TEXT); - -#endif // _MSC_VER - + FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst()); - - if (std::fclose(Output) != 0) { - std::stringstream what_; - what_ << "can't close OUTPUT file \"" << argv[optind + 2] << "\""; - throw Exception::apertium_tagger::fclose(what_); + try_close_file("OUTPUT", argv[optind + 2], Output); } - } - if (std::fclose(Input) != 0) { - std::stringstream what_; - what_ << "can't close INPUT file \"" << argv[optind + 1] << "\""; - throw Exception::apertium_tagger::fclose(what_); + try_close_file("INPUT", argv[optind + 1], Input); } - } } void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -724,66 +644,22 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - FILE *Serialised_FILE_Tagger = std::fopen(argv[optind + 1], "rb"); - - if (Serialised_FILE_Tagger == NULL) { - std::stringstream what_; - what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] - << "\" for reading in binary mode"; - throw Exception::apertium_tagger::fopen(what_); - } - + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 1], "rb"); FILE_Tagger_.deserialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger); - if (std::fclose(Serialised_FILE_Tagger) != 0) { - std::stringstream what_; - what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] - << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *Corpus = std::fopen(argv[optind], "r"); - - if (Corpus == NULL) { - std::stringstream what_; - what_ << "can't open CORPUS file \"" << argv[optind] << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - -#ifdef _MSC_VER - - _setmode(_fileno(Corpus), _O_U8TEXT); - -#endif // _MSC_VER - + FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind], "r"); FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + try_close_file("CORPUS", argv[optind], Corpus); - if (std::fclose(Corpus) != 0) { - std::stringstream what_; - what_ << "can't close CORPUS file \"" << argv[optind] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - Serialised_FILE_Tagger = std::fopen(argv[optind + 1], "wb"); - - if (Serialised_FILE_Tagger == NULL) { - std::stringstream what_; - what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] - << "\" for writing in binary mode"; - throw Exception::apertium_tagger::fopen(what_); - } - + Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 1], "wb"); FILE_Tagger_.serialise(Serialised_FILE_Tagger); - - if (std::fclose(Serialised_FILE_Tagger) != 0) { - std::stringstream what_; - what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] - << "\""; - throw Exception::apertium_tagger::fclose(what_); - } + try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger); } void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -799,102 +675,25 @@ FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *Dictionary = std::fopen(argv[optind], "r"); - - if (Dictionary == NULL) { - std::stringstream what_; - what_ << "can't open DICTIONARY file \"" << argv[optind] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - + FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r"); FILE_Tagger_.read_dictionary(Dictionary); + try_close_file("DICTIONARY", argv[optind], Dictionary); - if (std::fclose(Dictionary) != 0) { - std::stringstream what_; - what_ << "can't close DICTIONARY file \"" << argv[optind] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - FILE *TaggedCorpus = std::fopen(argv[optind + 4], "r"); - - if (TaggedCorpus == NULL) { - std::stringstream what_; - what_ << "can't open TAGGED_CORPUS file \"" << argv[optind + 4] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - - FILE *UntaggedCorpus = std::fopen(argv[optind + 5], "r"); - - if (UntaggedCorpus == NULL) { - std::stringstream what_; - what_ << "can't open UNTAGGED_CORPUS file \"" << argv[optind + 5] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - -#ifdef _MSC_VER - - _setmode(_fileno(TaggedCorpus), _O_U8TEXT); - _setmode(_fileno(UntaggedCorpus), _O_U8TEXT); - -#endif // _MSC_VER - + FILE *TaggedCorpus = try_open_file_utf8("TAGGED_CORPUS", argv[optind + 4], "r"); + FILE *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", argv[optind + 5], "r"); FILE_Tagger_.init_probabilities_from_tagged_text_(TaggedCorpus, UntaggedCorpus); + try_close_file("TAGGED_CORPUS", argv[optind + 4], TaggedCorpus); + try_close_file("UNTAGGED_CORPUS", argv[optind + 5], UntaggedCorpus); - if (std::fclose(TaggedCorpus) != 0) { - std::stringstream what_; - what_ << "can't close TAGGED_CORPUS file \"" << argv[optind + 4] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - if (std::fclose(UntaggedCorpus) != 0) { - std::stringstream what_; - what_ << "can't close UNTAGGED_CORPUS file \"" << argv[optind + 5] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - FILE *Corpus = std::fopen(argv[optind + 1], "r"); - - if (Corpus == NULL) { - std::stringstream what_; - what_ << "can't open CORPUS file \"" << argv[optind + 1] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - -#ifdef _MSC_VER - - _setmode(_fileno(Corpus), _O_U8TEXT); - -#endif // _MSC_VER - + FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r"); FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + try_close_file("CORPUS", argv[optind + 1], UntaggedCorpus); - if (std::fclose(Corpus) != 0) { - std::stringstream what_; - what_ << "can't close CORPUS file \"" << argv[optind + 1] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - FILE *Stream_ = std::fopen(argv[optind + 3], "wb"); - - if (Stream_ == NULL) { - std::stringstream what_; - what_ << "can't open STREAM file \"" << argv[optind + 3] - << "\" for writing in binary mode"; - throw Exception::apertium_tagger::fopen(what_); - } - - FILE_Tagger_.serialise(Stream_); - - if (std::fclose(Stream_) != 0) { - std::stringstream what_; - what_ << "can't close STREAM file \"" << argv[optind + 3] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind + 3], UntaggedCorpus); } void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -910,63 +709,19 @@ FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *Dictionary = std::fopen(argv[optind], "r"); - - if (Dictionary == NULL) { - std::stringstream what_; - what_ << "can't open DICTIONARY file \"" << argv[optind] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - + FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r"); FILE_Tagger_.read_dictionary(Dictionary); + try_close_file("DICTIONARY", argv[optind], Dictionary); - if (std::fclose(Dictionary) != 0) { - std::stringstream what_; - what_ << "can't close DICTIONARY file \"" << argv[optind] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - FILE *Corpus = std::fopen(argv[optind + 1], "r"); - - if (Corpus == NULL) { - std::stringstream what_; - what_ << "can't open CORPUS file \"" << argv[optind + 1] - << "\" for reading"; - throw Exception::apertium_tagger::fopen(what_); - } - -#ifdef _MSC_VER - - _setmode(_fileno(Corpus), _O_U8TEXT); - -#endif // _MSC_VER - + FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r"); FILE_Tagger_.init_probabilities_kupiec_(Corpus); FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + try_close_file("CORPUS", argv[optind + 1], Corpus); - if (std::fclose(Corpus) != 0) { - std::stringstream what_; - what_ << "can't close CORPUS file \"" << argv[optind + 1] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } - - FILE *Stream_ = std::fopen(argv[optind + 3], "wb"); - - if (Stream_ == NULL) { - std::stringstream what_; - what_ << "can't open STREAM file \"" << argv[optind + 3] - << "\" for writing in binary mode"; - throw Exception::apertium_tagger::fopen(what_); - } - - FILE_Tagger_.serialise(Stream_); - - if (std::fclose(Stream_) != 0) { - std::stringstream what_; - what_ << "can't close STREAM file \"" << argv[optind + 3] << "\""; - throw Exception::apertium_tagger::fclose(what_); - } + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind + 3], Serialised_FILE_Tagger); } } Index: trunk/apertium/apertium/exception.h =================================================================== --- trunk/apertium/apertium/exception.h (revision 69612) +++ trunk/apertium/apertium/exception.h (revision 69619) @@ -40,12 +40,9 @@ EXCEPTION(deserialise) EXCEPTION(fclose) EXCEPTION(fopen) -EXCEPTION(ifstream_fail) -EXCEPTION(ofstream_fail) +EXCEPTION(open_stream_fail) EXCEPTION(optarg_eq_NULL) EXCEPTION(str_end_not_eq_NULL) -EXCEPTION(wifstream_fail) -EXCEPTION(wofstream_fail) EXCEPTION(ERANGE_) EXCEPTION(InvalidArgument) EXCEPTION(InvalidOption) Index: trunk/apertium/apertium/lswpost.cc =================================================================== --- trunk/apertium/apertium/lswpost.cc (revision 69612) +++ trunk/apertium/apertium/lswpost.cc (revision 69619) @@ -342,14 +342,12 @@ word_left->set_show_sf(show_sf); tags_left = word_left->get_tags(); // tags left - tags_left = require_similar_ambiguity_class(tdlsw, tags_left, *word_left, debug); - + warn_absent_ambiguity_class(tdlsw, tags_left, *word_left, debug); word_mid = morpho_stream.get_next_word(); // word mid word_mid->set_show_sf(show_sf); tags_mid = word_mid->get_tags(); // tags mid - tags_mid = require_similar_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); - + warn_absent_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); if (morpho_stream.getEndOfFile()) { delete word_left; delete word_mid; @@ -361,9 +359,8 @@ wstring micad; while (word_right) { - tags_right = word_right->get_tags(); - tags_right = require_similar_ambiguity_class(tdlsw, tags_right, *word_right, debug); + warn_absent_ambiguity_class(tdlsw, tags_right, *word_right, debug); double max = -1; TTag tag_max = *tags_mid.begin(); Index: trunk/apertium/apertium/tagger_utils.cc =================================================================== --- trunk/apertium/apertium/tagger_utils.cc (revision 69612) +++ trunk/apertium/apertium/tagger_utils.cc (revision 69619) @@ -20,6 +20,8 @@ #include #include +#include +#include #include #ifdef _MSC_VER #define wcstok wcstok_s @@ -167,27 +169,18 @@ set tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { - int size_ret = -1; - set ret = td.getOpenClass(); // return open-class as default, if no better is found. - bool skip_class; + set &ret = td.getOpenClass(); Collection &output = td.getOutput(); - for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { - skip_class = false; - // Test if output[k] is a subset of class - for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { - if (c.find(*it)==c.end()) { - skip_class = true; //output[k] is not a subset of class - break; + for (int k=0; k &ambg_class = output[k]; + if (ambg_class.size() >= ret.size()) { + continue; } + if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { + ret = ambg_class; } - if (!skip_class) { - size_ret = output[k].size(); - ret = output[k]; } - } - } return ret; } @@ -208,10 +201,7 @@ } } -set -tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { - if (td.getOutput().has_not(tags)) { - if (debug) { +static void _warn_absent_ambiguity_class(TaggerWord &word) { wstring errors; errors = L"A new ambiguity class was found. \n"; errors += L"Retraining the tagger is necessary so as to take it into account.\n"; @@ -218,6 +208,13 @@ errors += L"Word '" + word.get_superficial_form() + L"'.\n"; errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; wcerr << L"Error: " << errors; +} + +set +tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags)) { + if (debug) { + _warn_absent_ambiguity_class(word); } return find_similar_ambiguity_class(td, tags); } @@ -224,6 +221,13 @@ return tags; } +void +tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags) && debug) { + _warn_absent_ambiguity_class(word); + } +} + template ostream& operator<< (ostream& os, const map & f){ typename map ::const_iterator it; Index: trunk/apertium/apertium/tagger_utils.h =================================================================== --- trunk/apertium/apertium/tagger_utils.h (revision 69612) +++ trunk/apertium/apertium/tagger_utils.h (revision 69619) @@ -91,6 +91,9 @@ * & prints a warning if debug */ set require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); +/** Just prints a warning if debug */ +void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); + wstring trim(wstring s); }; Index: trunk/apertium/apertium/hmm.cc =================================================================== --- trunk/apertium/apertium/hmm.cc (revision 69612) +++ trunk/apertium/apertium/hmm.cc (revision 69619) @@ -710,7 +710,7 @@ TaggerWord *word=NULL; TTag tag; - set tags, pretags; + set ambg_class_tags, tags, pretags; set ::iterator itag, jtag; double prob, loli, x; @@ -750,9 +750,9 @@ if (tags.size()==0) // This is an unknown word tags = tdhmm.getOpenClass(); - tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); + ambg_class_tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); - k = output[tags]; //Ambiguity class the word belongs to + k = output[ambg_class_tags]; //Ambiguity class the word belongs to #ifdef __GNUC__ clear_array_double(alpha[nwpend%2], N); Index: trunk/apertium/tests/tagger/__init__.py =================================================================== --- trunk/apertium/tests/tagger/__init__.py (nonexistent) +++ trunk/apertium/tests/tagger/__init__.py (revision 69619) @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import functools +import unittest +import tempfile +from os.path import join as pjoin +from os.path import abspath, dirname +from subprocess import (check_call, check_output, Popen, PIPE, DEVNULL, + TimeoutExpired, CalledProcessError) + + +# Utilities +def tmp(contents): + t = tempfile.NamedTemporaryFile(mode='w', delete=False) + t.write(contents) + return t.name + + +def rel(fn): + return abspath(pjoin(dirname(abspath(__file__)), fn)) + + +APERTIUM_TAGGER = rel("../../apertium/apertium-tagger") + + +def check_stderr(*popenargs, timeout=None, **kwargs): + # Essentially a copypasted version of check_output. + # Can be significantly abridged with Python 3.5's run(...) + if 'stderr' in kwargs: + raise ValueError('stderr argument not allowed, it will be overridden.') + if 'input' in kwargs: + if 'stdin' in kwargs: + raise ValueError('stdin and input arguments may not both be used.') + inputdata = kwargs['input'] + del kwargs['input'] + kwargs['stdin'] = PIPE + else: + inputdata = None + with Popen(*popenargs, stderr=PIPE, **kwargs) as process: + try: + unused_output, err = process.communicate(inputdata, + timeout=timeout) + except TimeoutExpired: + process.kill() + unused_output, err = process.communicate() + raise TimeoutExpired(process.args, timeout, output=err) + except: + process.kill() + process.wait() + raise + retcode = process.poll() + if retcode: + raise CalledProcessError(retcode, process.args, output=err) + return err + + +def trace_dec(f): + @functools.wraps(f) + def inner(*args, **kwargs): + if len(args) > 0: + print("run " + " ".join(args[0])) + return f(*args, **kwargs) + return inner + + +def trace_plus_unicode(f): + return functools.partial(trace_dec(f), universal_newlines=True) + +check_call = trace_plus_unicode(check_call) +check_output = trace_plus_unicode(check_output) +check_stderr = trace_plus_unicode(check_stderr) + +# Test files +DIC = """ +^the/the$ +^books/book/book$ +^has/have$ +^booked/book/book$ +^close/close/close/close/close/close$ +^cat/cat$ +^room/room$ +^red/red$ +^./.$ +""".strip() + +TSX = """ + + + + + + + + + + + + + + + + + + + + +""".strip() + +TRAIN_NO_PROBLEM_UNTAGGED = """ +^The/the$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book/book$ +^the/the$ +^red/red$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TRAIN_NO_PROBLEM_TAGGED = """ +^The/the$ +^cat/cat$ +^books/book$ +^the/the$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book$ +^the/the$ +^red/red$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TRAIN_CAT_TO_BE_A_VERB_UNTAGGED = """ +^The/The$ +^falling/fall/fall/fall$ +^cat/cat$ +^has/have$ +^booked/book/book$ +^books/book/book$ +^./.$ + +^Close/close/close/close/close/close$ +^the/the$ +^books/book/book$ +^./.$ + +^The/the$ +^falling/fall/fall/fall$ +^cat/cat$ +^has/have$ +^books/book/book$ +^./.$ +""".strip() + +TRAIN_CAT_TO_BE_A_VERB_TAGGED = """ +^The/The$ +^falling/fall$ +^cat/cat$ +^has/have$ +^booked/book$ +^books/book$ +^./.$ + +^Close/close$ +^the/the$ +^books/book$ +^./.$ + +^The/the$ +^falling/fall$ +^cat/cat$ +^has/have$ +^books/book$ +^./.$ +""".strip() + +TEST_SUCCESS = """ +^The/the$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TEST_NEW_AMBG_CLASS = """ +^The/the$ +^cat/cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +# Expected strings +EXPECTED_SUBST = """ +Error: A new ambiguity class was found. +Retraining the tagger is necessary so as to take it into account. +Word 'cat'. +New ambiguity class: {NOUN,ADJ} +""".strip().split("\n") + + +# Tests +class AmbiguityClassTest(unittest.TestCase): + def setUp(self): + self.tsx_fn = tmp(TSX) + self.dic_fn = tmp(DIC) + + def changing_class_impl(self, flags, model_fn): + test1 = tmp(TEST_SUCCESS) + test2 = tmp(TEST_NEW_AMBG_CLASS) + success_stderr = check_stderr( + [APERTIUM_TAGGER, '-d'] + flags + + ['-g', model_fn, test1], + stdout=DEVNULL) + self.assertEqual(success_stderr.strip(), "") + subst_stderr = check_stderr( + [APERTIUM_TAGGER, '-d'] + flags + + ['-g', model_fn, test2], + stdout=DEVNULL) + subst_stderr = [line.strip() + for line in subst_stderr.strip().split("\n")] + self.assertEqual(subst_stderr, EXPECTED_SUBST) + ambg_class = check_output( + [rel('test-find-similar-ambiguity-class'), model_fn], + input="NOUN ADJ\n") + substituted_class = set(ambg_class.split(" ")) + # Should get open class + self.assertSetEqual(substituted_class, set(("VERB", "NOUN", "ADJ"))) + + def test_changing_class_hmm_sup(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + tagged = tmp(TRAIN_NO_PROBLEM_TAGGED) + check_call( + [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, + model_fn, tagged, untagged]) + self.changing_class_impl([], model_fn) + + def test_changing_class_hmm_unsup(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '-t', '1', self.dic_fn, untagged, self.tsx_fn, + model_fn]) + self.changing_class_impl([], model_fn) + + def test_changing_class_sliding_window(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '--sliding-window', '-t', '1', self.dic_fn, + untagged, self.tsx_fn, model_fn]) + self.changing_class_impl(['--sliding-window'], model_fn) + + def test_cat_is_a_verb(self): + model_fn = tmp("") + untagged = tmp(TRAIN_CAT_TO_BE_A_VERB_UNTAGGED) + tagged = tmp(TRAIN_CAT_TO_BE_A_VERB_TAGGED) + new_ambg_class = tmp(TEST_NEW_AMBG_CLASS) + check_call( + [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, + model_fn, tagged, untagged]) + subst_stdout = check_output( + [APERTIUM_TAGGER, '-d', '-g', model_fn, new_ambg_class], + stderr=DEVNULL) + acceptable = False + for line in subst_stdout.split("\n"): + if (line.startswith('^cat') and ('' in line or '' in line)): + acceptable = True + self.assertTrue( + acceptable, + "'cat' must be output and tagged as an adjective or a noun.\n" + + "Actual output:\n{}".format(subst_stdout)) Property changes on: trunk/apertium/tests/tagger/__init__.py ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: trunk/apertium/tests/tagger/Makefile.am =================================================================== --- trunk/apertium/tests/tagger/Makefile.am (nonexistent) +++ trunk/apertium/tests/tagger/Makefile.am (revision 69619) @@ -0,0 +1,14 @@ +library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) + +bin_PROGRAMS = test-find-similar-ambiguity-class +bin_SCRIPTS = $(GENERATEDSCRIPTS) + +AM_CPPFLAGS = -I$(top_srcdir) + +apertiumdir = $(prefix)/share/apertium +apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION) +apertiumlib = $(prefix)/lib +apertiumsysconf = $(prefix)/etc/apertium + +test_find_similar_ambiguity_class_SOURCES = test_find_similar_ambiguity_classes.cc +test_find_similar_ambiguity_class_LDADD = -L$(top_srcdir)/$(GENERIC_LIBRARY_NAME)/.libs/ $(APERTIUM_LIBS) -l$(GENERIC_LIBRARY_NAME)$(GENERIC_MAJOR_VERSION) Index: trunk/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc =================================================================== --- trunk/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc (nonexistent) +++ trunk/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc (revision 69619) @@ -0,0 +1,61 @@ +#include "apertium/utf_converter.h" +#include "apertium/tagger_utils.h" +#include "apertium/tagger_data_hmm.h" +#include "apertium/tagger_data.h" +#include +#include +#include +#include + +void print_ambiguity_class(const vector &array_tags, const set &abgset) +{ + unsigned int j; + set::const_iterator abgseti; + for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) { + wcout << array_tags[*abgseti]; + if (j < abgset.size() - 1) { + wcout << " "; + } + } +} + +void find_similar_ambiguity_class_io(TaggerData &td) +{ + vector &array_tags = td.getArrayTags(); + wstring line = L""; + getline(wcin, line, L'\n'); + + wstringstream line_stream(line); + set ambiguity_class; + wstring tag_name; + while (line_stream >> tag_name) { + vector::iterator it; + it = find(array_tags.begin(), array_tags.end(), tag_name); + if (it == array_tags.end()) { + wcerr << L"Tag not in model: " << tag_name << L'\n'; + exit(-3); + } + ambiguity_class.insert(it - array_tags.begin()); + } + set similar_ambiguity_class = tagger_utils::find_similar_ambiguity_class(td, ambiguity_class); + print_ambiguity_class(array_tags, similar_ambiguity_class); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + cerr<<"Usage: "<\n"; + exit(-1); + } + char* probfile = argv[1]; + TaggerDataHMM tagger_data_hmm; + FILE* fin = fopen(probfile, "r"); + if (!fin) { + cerr<<"Error: cannot open file '"<