Index: branches/apertium-tagger/apertium2/apertium/apertium_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_tagger.cc (revision 69632) @@ -0,0 +1,737 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "apertium_tagger.h" + +#include "apertium_config.h" + +#include "align.h" +#include "basic_exception_type.h" +#include "basic_stream_tagger.h" +#include "basic_stream_tagger_trainer.h" +#include "basic_tagger.h" +#include "err_exception.h" +#include "exception.h" +#include "file_tagger.h" +#include "linebreak.h" +#include "stream_5_3_1_tagger.h" +#include "stream_5_3_1_tagger_trainer.h" +#include "stream_5_3_2_tagger.h" +#include "stream_5_3_2_tagger_trainer.h" +#include "stream_5_3_3_tagger.h" +#include "stream_5_3_3_tagger_trainer.h" +#include +#include +#include + +#include + +#include "getopt_long.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +#include +#endif // _MSC_VER + +namespace Apertium { +apertium_tagger::apertium_tagger(int &argc, char **&argv) + : argc(argc), argv(argv), The_val(), + + The_indexptr(), FunctionTypeTypeOption_indexptr(), + FunctionTypeOption_indexptr(), + + TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), + TheFunctionTypeOptionArgument(0), TheFlags() { + try { + while (true) { + // 'h' is listed so that the short form -h reaches case 'h' like --help does; + // without it getopt_long reports -h as an invalid option. + The_val = getopt_long(argc, argv, "dfghmpr:s:t:u:wz", longopts, &The_indexptr); + + if (The_val == -1) + break; + + set_indexptr(); + + switch (The_val) { + case 'd': + flagOptionCase(&basic_Tagger::Flags::getDebug, + &basic_Tagger::Flags::setDebug); + break; + case 'f': + flagOptionCase(&basic_Tagger::Flags::getFirst, + &basic_Tagger::Flags::setFirst); + break; + case 'm': + flagOptionCase(&basic_Tagger::Flags::getMark, + &basic_Tagger::Flags::setMark); + break; + case 'p': + flagOptionCase(&basic_Tagger::Flags::getShowSuperficial, + &basic_Tagger::Flags::setShowSuperficial); + break; + case 'z': + flagOptionCase(&basic_Tagger::Flags::getNullFlush, + &basic_Tagger::Flags::setNullFlush); + break; + case 'u': + functionTypeTypeOptionCase(Unigram); + + if (std::strncmp(optarg, "1", sizeof "1" - 1) == 0) { + TheUnigramType = Stream_5_3_1; 
+ break; + } + + if (std::strncmp(optarg, "2", sizeof "2" - 1) == 0) { + TheUnigramType = Stream_5_3_2; + break; + } + + if (std::strncmp(optarg, "3", sizeof "3" - 1) == 0) { + TheUnigramType = Stream_5_3_3; + break; + } + + { + std::stringstream what_; + what_ << "invalid argument '" << optarg << "' for '--unigram'\n" + "Valid arguments are:\n" + " - '1'\n" + " - '2'\n" + " - '3'"; + throw Exception::apertium_tagger::InvalidArgument(what_); + } + break; + case 'w': + functionTypeTypeOptionCase(SlidingWindow); + break; + case 'g': + functionTypeOptionCase(Tagger); + break; + case 'r': + functionTypeOptionCase(Retrain); + getIterationsArgument(); + break; + case 's': + functionTypeOptionCase(Supervised); + getIterationsArgument(); + break; + case 't': + functionTypeOptionCase(Train); + getIterationsArgument(); + break; + case 'h': + help(); + return; + default: + throw err_Exception(); + } + } + + if (!TheFunctionType) { + help(); + return; + } + + switch (*TheFunctionType) { + case Tagger: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + g_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + switch (*TheUnigramType) { + case Stream_5_3_1: { + Stream_5_3_1_Tagger Stream_5_3_1_Tagger_(TheFlags); + g_StreamTagger(Stream_5_3_1_Tagger_); + } break; + case Stream_5_3_2: { + Stream_5_3_2_Tagger Stream_5_3_2_Tagger_(TheFlags); + g_StreamTagger(Stream_5_3_2_Tagger_); + } break; + case Stream_5_3_3: { + Stream_5_3_3_Tagger Stream_5_3_3_Tagger_(TheFlags); + g_StreamTagger(Stream_5_3_3_Tagger_); + } break; + default: + std::abort(); + } + } break; + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + g_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + case Retrain: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + r_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + std::stringstream what_; + 
what_ << "invalid option -- 'u'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + r_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + case Supervised: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + s_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + switch (*TheUnigramType) { + case Stream_5_3_1: { + Stream_5_3_1_TaggerTrainer Stream_5_3_1_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_1_TaggerTrainer_); + } break; + case Stream_5_3_2: { + Stream_5_3_2_TaggerTrainer Stream_5_3_2_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_2_TaggerTrainer_); + } break; + case Stream_5_3_3: { + Stream_5_3_3_TaggerTrainer Stream_5_3_3_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_3_TaggerTrainer_); + } break; + default: + std::abort(); + } + } break; + case SlidingWindow: { + std::stringstream what_; + what_ << "invalid option -- 'w'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + default: + std::abort(); + } + + break; + case Train: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + t_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + std::stringstream what_; + what_ << "invalid option -- 'u'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + t_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + default: + std::abort(); + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::cerr << "apertium-tagger: " << basic_ExceptionType_.what() << '\n'; + throw err_Exception(); + } +} + +void apertium_tagger::help() { + + std::cerr << +"Usage: apertium-tagger [OPTION]... 
-g SERIALISED_TAGGER \\\n" +" [INPUT \\\n" +" [OUTPUT]]\n" +"\n" +" or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" +" CORPUS \\\n" +" SERIALISED_TAGGER\n" +"\n" +" or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" +" DICTIONARY \\\n" +" CORPUS \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS \\\n" +" UNTAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -s 0 \\\n" +" -u MODEL \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" +" DICTIONARY \\\n" +" CORPUS \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER\n" +"\n" +"\n" +"Mandatory arguments to long options are mandatory for short options too.\n" +"\n"; + + std::vector > options_description_; + options_description_.push_back(std::make_pair("-d, --debug", "with -g, print error messages about the input")); + options_description_.push_back(std::make_pair("-f, --first", "with -g, reorder each lexical unit's analyses so that the chosen one is first")); + options_description_.push_back(std::make_pair("-m, --mark", "with -g, mark disambiguated lexical units")); + options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); + options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from ")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); + 
align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); + options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); + options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); + align::align_(options_description_); +} + +std::string apertium_tagger::option_string(const int &indexptr_) { + return option_string(longopts[indexptr_]); +} + +std::string apertium_tagger::option_string(const struct option &option_) { + std::stringstream option_string_; + option_string_ << "--" << option_.name; + return option_string_.str(); +} + +void apertium_tagger::locale_global_() { + +#if defined __clang__ + + std::locale::global(std::locale("")); + +#else +#if defined __APPLE__ + + LtLocale::tryToSetLocale(); + +#else + + std::locale::global(std::locale("")); + +#endif // defined __APPLE__ +#endif // defined __clang__ +} + +const struct option apertium_tagger::longopts[] = { + {"help", no_argument, 0, 'h'}, + {"debug", no_argument, 0, 'd'}, + {"first", no_argument, 0, 'f'}, + {"mark", no_argument, 0, 'm'}, + {"show-superficial", no_argument, 0, 'p'}, + {"null-flush", no_argument, 0, 'z'}, + {"unigram", required_argument, 0, 'u'}, + {"sliding-window", no_argument, 0, 'w'}, + {"tagger", no_argument, 0, 'g'}, + {"retrain", required_argument, 0, 'r'}, + {"supervised", required_argument, 0, 's'}, + 
{"train", required_argument, 0, 't'}, + {0, 0, 0, 0}}; + +void apertium_tagger::set_indexptr() { + if (The_val == longopts[The_indexptr].val) + return; + + for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; + ++longopts_Index) { + if (The_val == longopts[longopts_Index].val) { + The_indexptr = longopts_Index; + return; + } + } +} + +void apertium_tagger::flagOptionCase( + bool (basic_Tagger::Flags::*GetFlag)() const, + void (basic_Tagger::Flags::*SetFlag)(const bool &)) { + if ((TheFlags.*GetFlag)()) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string() << '\''; + throw Exception::apertium_tagger::UnexpectedFlagOption(what_); + } + + (TheFlags.*SetFlag)(true); +} + +std::string apertium_tagger::option_string() { + return option_string(The_indexptr); +} + +void apertium_tagger::functionTypeTypeOptionCase( + const FunctionTypeType &FunctionTypeType_) { + if (FunctionTypeTypeOption_indexptr) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string(*FunctionTypeTypeOption_indexptr) + << '\''; + throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); + } + + TheFunctionTypeType = FunctionTypeType_; + FunctionTypeTypeOption_indexptr = The_indexptr; +} + +void apertium_tagger::functionTypeOptionCase( + const FunctionType &FunctionType_) { + if (FunctionTypeOption_indexptr) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string(*FunctionTypeOption_indexptr) + << '\''; + throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); + } + + TheFunctionType = FunctionType_; + FunctionTypeOption_indexptr = The_indexptr; +} + +void apertium_tagger::getIterationsArgument() { + try { + TheFunctionTypeOptionArgument = optarg_unsigned_long(); + } catch (const ExceptionType &ExceptionType_) { + std::stringstream what_; + what_ << "invalid argument '" << optarg << "' 
for '" << option_string() + << '\''; + throw Exception::apertium_tagger::InvalidArgument(what_); + } +} + +unsigned long apertium_tagger::optarg_unsigned_long() const { + char *str_end; + errno = 0; + unsigned long N_0 = std::strtoul(optarg, &str_end, 10); + + if (*str_end != '\0') { + std::stringstream what_; + what_ << "can't convert char *optarg \"" << optarg << "\" to unsigned long"; + throw Exception::apertium_tagger::str_end_not_eq_NULL(what_); + } + + if (*optarg == '\0') { + std::stringstream what_; + what_ << "can't convert char *optarg of size 1 \"\" to unsigned long"; + throw Exception::apertium_tagger::optarg_eq_NULL(what_); + } + + if (errno == ERANGE) { + std::stringstream what_; + what_ << "can't convert char *optarg \"" << optarg + << "\" to unsigned long, not in unsigned long range"; + throw Exception::apertium_tagger::ERANGE_(what_); + } + + return N_0; +} + +template +static void try_open_fstream(const char *metavar, const char *filename, + T &stream) { + stream.open(filename); + if (stream.fail()) { + std::stringstream what_; + what_ << "can't open " << metavar << " file \"" << filename << "\""; + throw Exception::apertium_tagger::open_stream_fail(what_); + } +} + +static FILE *try_open_file(const char *metavar, const char *filename, + const char *flags) { + FILE *f = std::fopen(filename, flags); + if (f == NULL) { + std::stringstream what_; + what_ << "can't open " << metavar << " file \"" << filename << "\""; + throw Exception::apertium_tagger::fopen(what_); + } + return f; +} + +static inline FILE *try_open_file_utf8(const char *metavar, const char *filename, + const char *flags) { + FILE *f = try_open_file(metavar, filename, flags); +#ifdef _MSC_VER + _setmode(_fileno(f), _O_U8TEXT); +#endif // _MSC_VER + return f; +} + +static void try_close_file(const char *metavar, const char *filename, FILE *file) { + if (std::fclose(file) != 0) { + std::stringstream what_; + what_ << "can't close " << metavar << " file \"" << filename << "\""; + throw 
Exception::apertium_tagger::fclose(what_); + } +} + +// Deserialises the stream tagger from argv[optind], then tags standard input +// (one file argument), INPUT to std::wcout (two) or INPUT to OUTPUT (three). +void apertium_tagger::g_StreamTagger(basic_StreamTagger &StreamTagger_) { + locale_global_(); + + if (argc - optind < 1 || !(argc - optind < 4)) { + std::stringstream what_; + what_ << "expected 1, 2, or 3 file arguments, got " << argc - optind; + throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); + } + + std::ifstream SerialisedAnalysisFrequencies; + try_open_fstream("SERIALISED_TAGGER", argv[optind], + SerialisedAnalysisFrequencies); + + try { + StreamTagger_.deserialise(SerialisedAnalysisFrequencies); + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] + << "\" Reason: " << basic_ExceptionType_.what(); + throw Exception::apertium_tagger::deserialise(what_); + } + + if (argc - optind < 2) { + Stream Input(TheFlags); + StreamTagger_.tag(Input, std::wcout); + return; + } + + std::wifstream Input_stream; + try_open_fstream("INPUT", argv[optind + 1], Input_stream); + + if (argc - optind < 3) { + Stream Input(TheFlags, Input_stream, argv[optind + 1]); + StreamTagger_.tag(Input, std::wcout); + return; + } + + // OUTPUT has to be opened on Output_stream. Re-opening the already open + // Input_stream fails, so the three-argument form could never succeed. + std::wofstream Output_stream; + try_open_fstream("OUTPUT", argv[optind + 2], Output_stream); + + Stream Input(TheFlags, Input_stream, argv[optind + 1]); + StreamTagger_.tag(Input, Output_stream); +} + +void apertium_tagger::s_StreamTaggerTrainer( + basic_StreamTaggerTrainer &StreamTaggerTrainer_) { + locale_global_(); + + if (TheFunctionTypeOptionArgument != 0) { + std::stringstream what_; + what_ << "invalid argument '" << TheFunctionTypeOptionArgument + << "' for '--supervised'"; + throw Exception::apertium_tagger::InvalidArgument(what_); + } + + if (argc - optind < 2 || !(argc - optind < 3)) { + std::stringstream what_; + what_ << "expected 2 file arguments, got " << argc - optind; + throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); + } + + std::wifstream 
TaggedCorpus_stream; + try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); + + // Pass the path the stream was opened from (TAGGED_CORPUS), matching how + // g_StreamTagger pairs Input_stream with argv[optind + 1]; the original + // passed the SERIALISED_TAGGER path. NOTE(review): the third Stream argument + // is presumably the file name used in diagnostics -- confirm against Stream. + Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]); + StreamTaggerTrainer_.train(TaggedCorpus); + + std::ofstream Serialised_basic_Tagger; + try_open_fstream("SERIALISED_TAGGER", argv[optind], + Serialised_basic_Tagger); + + StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); +} + +void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + if (argc - optind < 1 || !(argc - optind < 4)) { + std::stringstream what_; + what_ << "expected 1, 2, or 3 file arguments, got " << argc - optind; + throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); + } + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); + FILE_Tagger_.deserialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); + + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); + TaggerWord::generate_marks = TheFlags.getMark(); + FILE_Tagger_.set_show_sf(TheFlags.getShowSuperficial()); + FILE_Tagger_.setNullFlush(TheFlags.getNullFlush()); + + if (argc - optind < 2) + FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst()); + else { + FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); + + if (argc - optind < 3) + FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst()); + else { + FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); + FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst()); + try_close_file("OUTPUT", argv[optind + 2], Output); + } + + try_close_file("INPUT", argv[optind + 1], Input); + } +} + +void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + if (argc - optind < 2 || !(argc - optind < 3)) { + std::stringstream what_; + what_ << "expected 2 file arguments, got " << argc - optind; + throw 
Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); + } + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 1], "rb"); + FILE_Tagger_.deserialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger); + + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); + + FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind], "r"); + FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + try_close_file("CORPUS", argv[optind], Corpus); + + Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 1], "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger); +} + +void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + if (argc - optind < 6 || !(argc - optind < 7)) { + std::stringstream what_; + what_ << "expected 6 file arguments, got " << argc - optind; + throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); + } + + FILE_Tagger_.deserialise(argv[optind + 2]); + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); + + FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r"); + FILE_Tagger_.read_dictionary(Dictionary); + try_close_file("DICTIONARY", argv[optind], Dictionary); + + FILE *TaggedCorpus = try_open_file_utf8("TAGGED_CORPUS", argv[optind + 4], "r"); + FILE *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", argv[optind + 5], "r"); + FILE_Tagger_.init_probabilities_from_tagged_text_(TaggedCorpus, + UntaggedCorpus); + try_close_file("TAGGED_CORPUS", argv[optind + 4], TaggedCorpus); + try_close_file("UNTAGGED_CORPUS", argv[optind + 5], UntaggedCorpus); + + FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r"); + FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + 
try_close_file("CORPUS", argv[optind + 1], Corpus); // was UntaggedCorpus, which is already closed above + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + // Close the file that was just written so its buffered data is flushed; + // the original closed UntaggedCorpus a third time instead. + try_close_file("SERIALISED_TAGGER", argv[optind + 3], Serialised_FILE_Tagger); +} + +void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + if (argc - optind < 4 || !(argc - optind < 5)) { + std::stringstream what_; + what_ << "expected 4 file arguments, got " << argc - optind; + throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); + } + + FILE_Tagger_.deserialise(argv[optind + 2]); + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); + + FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r"); + FILE_Tagger_.read_dictionary(Dictionary); + try_close_file("DICTIONARY", argv[optind], Dictionary); + + FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r"); + FILE_Tagger_.init_probabilities_kupiec_(Corpus); + FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + try_close_file("CORPUS", argv[optind + 1], Corpus); + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind + 3], Serialised_FILE_Tagger); +} +} + +int main(int argc, char **argv) { + try { + apertium_tagger(argc, argv); + } catch (const err_Exception &err_Exception_) { + std::cerr << "Try 'apertium-tagger --help' for more information.\n"; + return 1; + } catch (...) 
{ + throw; + } +} Index: branches/apertium-tagger/apertium2/apertium/exception.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/exception.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/exception.h (revision 69632) @@ -0,0 +1,92 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef EXCEPTION_APERTIUM_TAGGER_H +#define EXCEPTION_APERTIUM_TAGGER_H + +#include "exception_type.h" + +#include + +namespace Apertium { +namespace Exception { + +#define EXCEPTION(EXCEPTION_TYPE) \ + class EXCEPTION_TYPE : public ::Apertium::ExceptionType { \ + public: \ + EXCEPTION_TYPE(const char *const what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const std::string &what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const std::stringstream &what_) : ExceptionType(what_) {} \ + ~EXCEPTION_TYPE() throw() {} \ + }; + +namespace Analysis { +EXCEPTION(TheMorphemes_empty) +} + +namespace apertium_tagger { +EXCEPTION(deserialise) +EXCEPTION(fclose) +EXCEPTION(fopen) +EXCEPTION(open_stream_fail) +EXCEPTION(optarg_eq_NULL) +EXCEPTION(str_end_not_eq_NULL) +EXCEPTION(ERANGE_) +EXCEPTION(InvalidArgument) +EXCEPTION(InvalidOption) +EXCEPTION(UnexpectedFileArgumentCount) +EXCEPTION(UnexpectedFlagOption) +EXCEPTION(UnexpectedFunctionTypeOption) +EXCEPTION(UnexpectedFunctionTypeTypeOption) +} + +namespace Deserialiser { +EXCEPTION(size_t_) +EXCEPTION(not_Stream_good) +EXCEPTION(wchar_t_) +} + +namespace LexicalUnit { +EXCEPTION(TheAnalyses_empty) +} + +namespace Morpheme { +EXCEPTION(TheLemma_empty) +EXCEPTION(TheTags_empty) +} + +namespace Optional { +EXCEPTION(TheOptionalTypePointer_null) +} + +namespace Serialiser { +EXCEPTION(not_Stream_good) +EXCEPTION(size_t_) +EXCEPTION(wchar_t_) +} + +namespace Tag { +EXCEPTION(TheTags_empty) +} + +namespace wchar_t_ExceptionType { +EXCEPTION(EILSEQ_) +} + +#undef EXCEPTION +} +} + +#endif // EXCEPTION_APERTIUM_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/lswpost.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lswpost.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lswpost.cc (revision 69632) @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; 
you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +/** + * Light Sliding-Window Part of Speech Tagger (LSWPoST) implementation (source) + * + * @author Gang Chen - pkuchengang@gmail.com + */ + + +#include +#include +#include "apertium_config.h" +#include +#include + +#ifdef WIN32 +#define isnan(n) _isnan(n) +#define isinf(n) (!_finite(n)) +#endif + +#ifdef __clang__ +#undef __GNUC__ +#endif + +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Apertium; +using namespace tagger_utils; + +void LSWPoST::deserialise(FILE *Serialised_FILE_Tagger) { + tdlsw.read(Serialised_FILE_Tagger); + eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; +} + +std::vector &LSWPoST::getArrayTags() { + return tdlsw.getArrayTags(); +} + +void LSWPoST::serialise(FILE *Stream_) { tdlsw.write(Stream_); } + +void LSWPoST::deserialise(const TaggerData &Deserialised_FILE_Tagger) { + tdlsw = TaggerDataLSW(Deserialised_FILE_Tagger); + eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; +} + +void LSWPoST::init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus) { + std::abort(); +} + +void LSWPoST::init_probabilities_kupiec_(FILE *Corpus) { + init_probabilities(Corpus); +} + +void LSWPoST::train(FILE *Corpus, unsigned long Count) { + for (; Count > 0; --Count) { + std::fseek(Corpus, 0, SEEK_SET); + train(Corpus); + } +} + +LSWPoST::LSWPoST() {} + +LSWPoST::LSWPoST(TaggerDataLSW t) { + tdlsw = t; + eos = 
(tdlsw.getTagIndex())[L"TAG_SENT"]; +} + +LSWPoST::~LSWPoST() {} + +LSWPoST::LSWPoST(TaggerDataLSW *tdlsw) : tdlsw(*tdlsw) {} + +void +LSWPoST::set_eos(TTag t) { + eos = t; +} + +void +LSWPoST::init_probabilities(FILE *ftxt) { + + int N = tdlsw.getN(); + int nw = 0; + TaggerWord *word = NULL; + set tags_left, tags_mid, tags_right; + set::iterator iter_left, iter_mid, iter_right; + vector > > para_matrix(N, vector >(N, vector(N, 0))); + MorphoStream morpho_stream(ftxt, true, &tdlsw); + int num_valid_seq = 0; + + word = new TaggerWord(); // word for tags left + word->add_tag(eos, L"sent", tdlsw.getPreferRules()); + tags_left = word->get_tags(); // tags left + if (tags_left.size()==0) { //This is an unknown word + tags_left = tdlsw.getOpenClass(); + } + + require_ambiguity_class(tdlsw, tags_left, *word, nw); + ++nw; + delete word; + word = morpho_stream.get_next_word(); // word for tags mid + tags_mid = word->get_tags(); // tags mid + if (tags_mid.size()==0) { //This is an unknown word + tags_mid = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_mid, *word, nw); + ++nw; + delete word; + if (morpho_stream.getEndOfFile()) { + return; + } + + word = morpho_stream.get_next_word(); // word for tags right + + // count each element of the para matrix + while (word != NULL) { + if (++nw % 10000 == 0) { + wcerr << L'.' 
<< flush; + } + + tags_right = word->get_tags(); // tags right + if (tags_right.size()==0) { //This is an unknown word + tags_right = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_right, *word, nw); + + num_valid_seq = tags_left.size() * tags_mid.size() * tags_right.size(); + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + if (!is_valid_seq(*iter_left, *iter_mid, *iter_right)) { + --num_valid_seq; + } + } // for iter_right + } // for iter_mid + } // for iter_left + + if (num_valid_seq != 0) { + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + if (is_valid_seq(*iter_left, *iter_mid, *iter_right)) { + para_matrix[*iter_left][*iter_mid][*iter_right] += 1.0 / num_valid_seq; + } + } // for iter_right + } // for iter_mid + } // for iter_left + } + + tags_left = tags_mid; + tags_mid = tags_right; + delete word; + word = morpho_stream.get_next_word(); + } // while word != NULL + + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + tdlsw.getD()[i][j][k] = para_matrix[i][j][k]; + } + } + } + + wcerr << L"\n"; +} + +bool LSWPoST::is_valid_seq(TTag left, TTag mid, TTag right) { + + vector &forbid_rules = tdlsw.getForbidRules(); + vector &enforce_rules = tdlsw.getEnforceRules(); + + for (size_t r = 0; r < forbid_rules.size(); ++r) { + if ((left == forbid_rules[r].tagi && mid == forbid_rules[r].tagj) + || (mid == forbid_rules[r].tagi && right == forbid_rules[r].tagj)) { + return false; + } + }// for r in forbid rules + + for (size_t r = 0; r < enforce_rules.size(); ++r) { + if (left == enforce_rules[r].tagi) { + bool found 
= false; + for (size_t j = 0; j < enforce_rules[r].tagsj.size(); ++j) { + if (enforce_rules[r].tagsj[j] == mid) { + found = true; + break; + } + } + if (!found) { + return false; + } + } else if (mid == enforce_rules[r].tagi) { + bool found = false; + for (size_t j = 0; j < enforce_rules[r].tagsj.size(); ++j) { + if (enforce_rules[r].tagsj[j] == right) { + found = true; + break; + } + } + if (!found) { + return false; + } + } + } // for r in enforce rules + + return true; +} + +void +LSWPoST::read_dictionary(FILE *fdic) { + tagger_utils::read_dictionary(fdic, tdlsw); + int N = (tdlsw.getTagIndex()).size(); + int M = (tdlsw.getOutput()).size(); + wcerr << N << L" states and " << M < tags_left, tags_mid, tags_right; + set::iterator iter_left, iter_mid, iter_right; + vector > > para_matrix_new(N, vector >(N, vector(N, 0))); + MorphoStream morpho_stream(ftxt, true, &tdlsw); + + word = new TaggerWord(); // word for tags left + word->add_tag(eos, L"sent", tdlsw.getPreferRules()); + tags_left = word->get_tags(); // tags left + if (tags_left.size()==0) { //This is an unknown word + tags_left = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_left, *word, nw); + ++nw; + delete word; + word = morpho_stream.get_next_word(); // word for tags mid + tags_mid = word->get_tags(); // tags mid + if (tags_mid.size()==0) { //This is an unknown word + tags_mid = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_mid, *word, nw); + ++nw; + delete word; + if (morpho_stream.getEndOfFile()) { + return; + } + + word = morpho_stream.get_next_word(); // word for tags right + + while (word) { + if (++nw % 10000 == 0) { + wcerr << L'.' 
<< flush; + } + + tags_right = word->get_tags(); // tags right + if (tags_right.size()==0) { //This is an unknown word + tags_right = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_right, *word, nw); + + double normalization = 0; + + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + normalization += tdlsw.getD()[*iter_left][*iter_mid][*iter_right]; + } + } + } + + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + if (normalization > ZERO) { + para_matrix_new[*iter_left][*iter_mid][*iter_right] += + tdlsw.getD()[*iter_left][*iter_mid][*iter_right] / normalization; + } + } + } + } + + tags_left = tags_mid; + tags_mid = tags_right; + delete word; + word = morpho_stream.get_next_word(); + } + + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + tdlsw.getD()[i][j][k] = para_matrix_new[i][j][k]; + } + } + } +} + +void +LSWPoST::print_para_matrix() { + wcout << L"para matrix D\n----------------------------\n"; + for (int i = 0; i < tdlsw.getN(); ++i) { + for (int j = 0; j < tdlsw.getN(); ++j) { + for (int k = 0; k < tdlsw.getN(); ++k) { + wcout << L"D[" << i << L"][" << j << L"][" << k << L"] = " + << tdlsw.getD()[i][j][k] << "\n"; + } + } + } +} + +void +LSWPoST::tagger(FILE *Input, FILE *Output, const bool &First) { + TaggerWord *word_left = NULL, *word_mid = NULL, *word_right = NULL; + set tags_left, tags_mid, tags_right; + set::iterator iter_left, iter_mid, iter_right; + MorphoStream morpho_stream(Input, debug, &tdlsw); + morpho_stream.setNullFlush(null_flush); + + word_left = new TaggerWord(); // word left + 
word_left->add_tag(eos, L"sent", tdlsw.getPreferRules()); + word_left->set_show_sf(show_sf); + tags_left = word_left->get_tags(); // tags left + + warn_absent_ambiguity_class(tdlsw, tags_left, *word_left, debug); + word_mid = morpho_stream.get_next_word(); // word mid + word_mid->set_show_sf(show_sf); + tags_mid = word_mid->get_tags(); // tags mid + + warn_absent_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); + if (morpho_stream.getEndOfFile()) { + delete word_left; + delete word_mid; + return; + } + word_right = morpho_stream.get_next_word(); // word_right + word_right->set_show_sf(show_sf); + + wstring micad; + + while (word_right) { + tags_right = word_right->get_tags(); + warn_absent_ambiguity_class(tdlsw, tags_right, *word_right, debug); + + double max = -1; + TTag tag_max = *tags_mid.begin(); + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + double n = 0; + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + n += tdlsw.getD()[*iter_left][*iter_mid][*iter_right]; + } + } + if (n > max) { + max = n; + tag_max = *iter_mid; + } + } + + micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())[L"TAG_kEOF"]); + fputws_unlocked(micad.c_str(), Output); + if (morpho_stream.getEndOfFile()) { + if (null_flush) { + fputwc_unlocked(L'\0', Output); + } + fflush(Output); + morpho_stream.setEndOfFile(false); + } + + delete word_left; + word_left = word_mid; + tags_left = tags_mid; + word_mid = word_right; + tags_mid = tags_right; + word_right = morpho_stream.get_next_word(); + if (word_right != NULL) { + word_right->set_show_sf(show_sf); + } + } + delete word_left; + delete word_mid; +} Index: branches/apertium-tagger/apertium2/apertium/tagger_utils.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_utils.cc (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tagger_utils.cc (revision 69632) @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include + +#include +#include +#include +#include +#include +#ifdef _MSC_VER +#define wcstok wcstok_s +#endif +#ifdef __MINGW32__ + +wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) { + (void)ptr; + return wcstok(wcs, delim); +} + +#define wcstok _wcstok +#endif + +using namespace Apertium; + + +void tagger_utils::fatal_error (wstring const &s) { + wcerr< v[], int l) { + for(int i=0; i0)&&(s.at(s.length()-1)==L' ')) + s.erase(s.length()-1,1); + if ((s.length()>0)&&(s.at(0)==L' ')) + s.erase(0,1); + + return s; +} + +void +tagger_utils::read_dictionary(FILE *fdic, TaggerData &td) { + int i, k, nw = 0; + TaggerWord *word = NULL; + set tags; + Collection &output = td.getOutput(); + + MorphoStream morpho_stream(fdic, true, &td); + + // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark + + word = morpho_stream.get_next_word(); + + while (word) { + if (++nw % 10000 == 0) + wcerr << L'.' 
<< flush; + + tags = word->get_tags(); + + if (tags.size() > 0) + k = output[tags]; + + delete word; + word = morpho_stream.get_next_word(); + } + wcerr << L"\n"; + + // OPEN AMBIGUITY CLASS + // It contains all tags that are not closed. + // Unknown words are assigned the open ambiguity class + k = output[td.getOpenClass()]; + + // Create ambiguity class holding one single tag for each tag. + // If not created yet + int N = (td.getTagIndex()).size(); + for(i = 0; i != N; i++) { + set amb_class; + amb_class.insert(i); + k = output[amb_class]; + } +} + +set +tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { + set &ret = td.getOpenClass(); + Collection &output = td.getOutput(); + + for (int k=0; k &ambg_class = output[k]; + if (ambg_class.size() >= ret.size()) { + continue; + } + if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { + ret = ambg_class; + } + } + return ret; +} + +void +tagger_utils::require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw) { + if (td.getOutput().has_not(tags)) { + wstring errors; + errors = L"A new ambiguity class was found. I cannot continue.\n"; + errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n"; + errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n"; + if (nw >= 0) { + std::wostringstream ws; + ws << (nw + 1); + errors+= L"Line number: " + ws.str() + L"\n"; + } + errors+= L"Take a look at the dictionary, then retrain."; + fatal_error(errors); + } +} + +static void _warn_absent_ambiguity_class(TaggerWord &word) { + wstring errors; + errors = L"A new ambiguity class was found. 
\n"; + errors += L"Retraining the tagger is necessary so as to take it into account.\n"; + errors += L"Word '" + word.get_superficial_form() + L"'.\n"; + errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; + wcerr << L"Error: " << errors; +} + +set +tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags)) { + if (debug) { + _warn_absent_ambiguity_class(word); + } + return find_similar_ambiguity_class(td, tags); + } + return tags; +} + +void +tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags) && debug) { + _warn_absent_ambiguity_class(word); + } +} + +template +ostream& operator<< (ostream& os, const map & f){ + typename map ::const_iterator it; + os<first<<' '<second; + return os; +} + +template +istream& operator>> (istream& is, map & f) { + int n, i, k; + f.clear(); + is>>n; + for (k=0; k>i; // warning: does not work if both + is>>f[i]; // lines merged in a single one + } + if (is.bad()) tagger_utils::fatal_error(L"reading map"); + return is; +} + +template +ostream& operator<< (ostream& os, const set& s) { + typename set::iterator it = s.begin(); + os<<'{'; + if (it!=s.end()) { + os<<*it; + while (++it!=s.end()) os<<','<<*it; + } + os<<'}'; + return os; +} + Index: branches/apertium-tagger/apertium2/apertium/tagger_utils.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_utils.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_utils.h (revision 69632) @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef __TAGGERUTILS_H +#define __TAGGERUTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace tagger_utils +{ +/** Print a fatal error message + * @param s the error message to print + */ +void fatal_error (wstring const &s); + +/** Print a fatal error message related to a file + * @param s the file name to be printted in the error message + */ +void file_name_error (string const &s); + +/** Convert from int to string + * @param i the int value to convert + * @return an string representing the number recived as input + */ +char *itoa(int i); + +/** Make all array positions equal to zero + * @param a the array + * @param l length of the array a + */ +void clear_array_double(double a[], int l); + +/** Clear all vectors stored in array v + * @param v array of vectors + * @param l length of the array v + */ +void clear_array_vector(vector v[], int l); + +/** Return the number of tokens in the multiword unit + */ + int ntokens_multiword(wstring const &s); + +/** Devuelve el nº de guiones que contiene la cadena pasada como argumento + */ +int nguiones_fs(wstring const &cadena); + +/** Reads the expanded dictionary received as a parameter puts the resulting + * ambiguity classes that the tagger will manage. + * @param fdic the input stream with the expanded dictionary to read + * @param td the tagger data instance to mutate + */ +void read_dictionary(FILE *fdic, TaggerData &td); + +/** This method returns a known ambiguity class that is a subset of +* the one received as a parameter. 
This is useful when a new +* ambiguity class is found because of changes in the morphological +* dictionary used by the MT system. +* @param c set of tags (ambiguity class) +* @return a known ambiguity class +*/ +set find_similar_ambiguity_class(TaggerData &td, set &c); + +/** Dies with an error message if the tags aren't in the tagger data */ +void require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw); + +/** As with find_similar_ambiguity_class, but returns tags if it's already fine + * & prints a warning if debug */ +set require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); + +/** Just prints a warning if debug */ +void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); + +wstring trim(wstring s); + +}; + +template +ostream& operator<< (ostream& os, const map & f); +template +istream& operator>> (istream& is, map & f); +template +ostream& operator<< (ostream& os, const set& s); + +#endif Index: branches/apertium-tagger/apertium2/apertium/hmm.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/hmm.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/hmm.cc (revision 69632) @@ -0,0 +1,872 @@ + +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +/* + * First order hidden Markov model (HMM) implementation (source) + * + * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es + */ + +#include +#include +#include "apertium_config.h" +#include +#include + +#ifdef WIN32 +#define isnan(n) _isnan(n) +#define isinf(n) (!_finite(n)) +#endif + +#ifdef __clang__ +#undef __GNUC__ +#endif + +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace tagger_utils; + +void HMM::deserialise(FILE *Serialised_FILE_Tagger) { + tdhmm.read(Serialised_FILE_Tagger); + eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; +} + +std::vector &HMM::getArrayTags() { + return tdhmm.getArrayTags(); +} + +void HMM::serialise(FILE *Stream_) { tdhmm.write(Stream_); } + +void HMM::deserialise(const TaggerData &Deserialised_FILE_Tagger) { + tdhmm = TaggerDataHMM(Deserialised_FILE_Tagger); + eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; +} + +void HMM::init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus) { + init_probabilities_from_tagged_text(TaggedCorpus, UntaggedCorpus); + apply_rules(); +} + +void HMM::init_probabilities_kupiec_(FILE *Corpus) { + init_probabilities_kupiec(Corpus); + apply_rules(); +} + +void HMM::train(FILE *Corpus, unsigned long Count) { + for (; Count > 0; --Count) { + std::fseek(Corpus, 0, SEEK_SET); + train(Corpus); + } + + apply_rules(); +} + +HMM::HMM() {} + +HMM::HMM(TaggerDataHMM tdhmm) +{ + tdhmm = tdhmm; + eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; +} + +HMM::HMM(TaggerDataHMM *tdhmm) : tdhmm(*tdhmm) {} + +HMM::~HMM() {} + +void +HMM::init() +{ +} + +void +HMM::set_eos(TTag t) +{ + eos = t; +} + +void +HMM::read_ambiguity_classes(FILE *in) +{ + while(in) + { + int ntags = Compression::multibyte_read(in); + + if(feof(in)) + { + break; + } + set ambiguity_class; + + for(; ntags != 0; ntags--) + { + ambiguity_class.insert(Compression::multibyte_read(in)); + } + + if(ambiguity_class.size() != 0) + { + tdhmm.getOutput().add(ambiguity_class); + } + } + + 
tdhmm.setProbabilities(tdhmm.getTagIndex().size(), tdhmm.getOutput().size()); +} + +void +HMM::write_ambiguity_classes(FILE *out) +{ + for(int i=0, limit = tdhmm.getOutput().size(); i != limit; i++) + { + set const &ac = (tdhmm.getOutput())[i]; + Compression::multibyte_write(ac.size(), out); + for(set::const_iterator it = ac.begin(), limit2 = ac.end(); + it != limit2; it++) + { + Compression::multibyte_write(*it, out); + } + } +} + +void +HMM::read_probabilities(FILE *in) +{ + tdhmm.read(in); +} + +void +HMM::write_probabilities(FILE *out) +{ + tdhmm.write(out); +} + +void +HMM::init_probabilities_kupiec (FILE *is) +{ + int N = tdhmm.getN(); + int M = tdhmm.getM(); + int i=0, j=0, k=0, k1=0, k2=0, nw=0; +#ifdef __GNUC__ + double classes_ocurrences[M]; //M = Number of ambiguity classes + double classes_pair_ocurrences[M][M]; + double tags_estimate[N]; //N = Number of tags (states) + double tags_pair_estimate[N][N]; +#else + vector classes_ocurrences (M, 1); + vector > classes_pair_ocurrences(M, vector(M, 1)); + vector tags_estimate(N, 0); + vector > tags_pair_estimate(N, vector(N, 0)); +#endif + + Collection &output = tdhmm.getOutput(); + + MorphoStream lexmorfo(is, true, &tdhmm); + + TaggerWord *word=NULL; + +#ifdef __GNUC__ + for(k=0; k tags; + tags.insert(eos); + k1=output[tags]; //The first tag (ambiguity class) seen is the end-of-sentence + + //We count for each ambiguity class the number of ocurrences + word = lexmorfo.get_next_word(); + while((word)) { + if (++nw%10000==0) wcerr<get_tags(); + + if (tags.size()==0) { //This is an unknown word + tags = tdhmm.getOpenClass(); + } + else { + require_ambiguity_class(tdhmm, tags, *word, nw); + } + + k2=output[tags]; + + classes_ocurrences[k1]++; + classes_pair_ocurrences[k1][k2]++; //k1 followed by k2 + delete word; + word=lexmorfo.get_next_word(); + + k1=k2; + + } + + //Estimation of the number of time each tags occurs in the training text + for(i=0; i tags1, tags2; + set::iterator itag1, itag2; + for(k1=0; k10) + 
(tdhmm.getA())[i][j] = tags_pair_estimate[i][j]/sum; + else { + (tdhmm.getA())[i][j] = 0; + } + } + } + + //b[i][k] estimation + for(i=0; i0) + (tdhmm.getB())[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i]; + else + (tdhmm.getB())[i][k] = 0; + } + } + } + wcerr< > tags_pair(N, vector(N, 0)); + vector > emission(N, vector(M, 0)); +#endif + + + MorphoStream stream_tagged(ftagged, true, &tdhmm); + MorphoStream stream_untagged(funtagged, true, &tdhmm); + + TaggerWord *word_tagged=NULL, *word_untagged=NULL; + Collection &output = tdhmm.getOutput(); + + + set tags; + +#ifdef __GNUC__ + // Init counters - each event appears at least once. + // Espected likelihood estimate (ELE) with a fixed initial count of 1 + for(i=0; iget_superficial_form()!=word_untagged->get_superficial_form()) { + wcerr<get_tags().size()==0) // Unknown word + tag1 = -1; + else if (word_tagged->get_tags().size()>1) // Ambiguous word + wcerr<get_superficial_form()<get_tags()).begin(); + + + if ((tag1>=0) && (tag2>=0)) + tags_pair[tag2][tag1]++; + + + if (word_untagged->get_tags().size()==0) { // Unknown word + tags = tdhmm.getOpenClass(); + } + else { + require_ambiguity_class(tdhmm, word_untagged->get_tags(), *word_untagged, nw); + tags = word_untagged->get_tags(); + } + + k=output[tags]; + if(tag1>=0) + emission[tag1][k]++; + + delete word_tagged; + word_tagged=stream_tagged.get_next_word(); + delete word_untagged; + word_untagged=stream_untagged.get_next_word(); + } + + + //Estimate of a[i][j] + for(i=0; i &forbid_rules = tdhmm.getForbidRules(); + vector &enforce_rules = tdhmm.getEnforceRules(); + int N = tdhmm.getN(); + int i, j, j2; + bool found; + + for(i=0; i<(int) forbid_rules.size(); i++) { + (tdhmm.getA())[forbid_rules[i].tagi][forbid_rules[i].tagj] = ZERO; + } + + for(i=0; i<(int) enforce_rules.size(); i++) { + for(j=0; j0) + (tdhmm.getA())[i][j] = (tdhmm.getA())[i][j]/sum; + else + (tdhmm.getA())[i][j] = 0; + } + } +} + +void +HMM::read_dictionary(FILE *fdic) { + 
tagger_utils::read_dictionary(fdic, tdhmm); + int N = (tdhmm.getTagIndex()).size(); + int M = (tdhmm.getOutput()).size(); + wcerr << N << L" states and " << M < > ambiguity_classes; + MorphoStream morpho_stream(in, true, &tdhmm); + + TaggerWord *word = morpho_stream.get_next_word(); + + while(word) { + set tags = word->get_tags(); + if(tags.size() > 0) { + if(ambiguity_classes.find(tags) == ambiguity_classes.end()) { + ambiguity_classes.insert(tags); + word->outputOriginal(out); + //wcerr<get_string_tags()< tags, pretags; + set::iterator itag, jtag; + map gamma; + map ::iterator jt, kt; + map < int, map > alpha, beta, xsi, phi; + map < int, map >::iterator it; + double prob, loli; + vector < set > pending; + Collection &output = tdhmm.getOutput(); + + int ndesconocidas=0; + // alpha => forward probabilities + // beta => backward probabilities + + MorphoStream morpho_stream(ftxt, true, &tdhmm); + + loli = 0; + tag = eos; + tags.clear(); + tags.insert(tag); + pending.push_back(tags); + + alpha[0].clear(); + alpha[0][tag] = 1; + + word = morpho_stream.get_next_word(); + + while (word) { + + //wcerr<get_tags(); + + if (tags.size()==0) { // This is an unknown word + tags = tdhmm.getOpenClass(); + ndesconocidas++; + } + + require_ambiguity_class(tdhmm, tags, *word, nw); + + k = output[tags]; + len = pending.size(); + alpha[len].clear(); + + //Forward probabilities + for (itag=tags.begin(); itag!=tags.end(); itag++) { + i=*itag; + for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) { + j=*jtag; + //cerr<<"previous alpha["<1) { + pending.push_back(tags); + } else { // word is unambiguous + tag = *tags.begin(); + beta[0].clear(); + beta[0][tag] = 1; + + prob = alpha[len][tag]; + + //cerr<<"prob="<1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())[L"TAG_kEOF"]))) + wcerr<first; + for (jt=xsi[i].begin(); jt!=xsi[i].end(); jt++) { + j = jt->first; + if (xsi[i][j]>0) { + if (gamma[i]==0) { + wcerr<first; + for (kt=phi[i].begin(); kt!=phi[i].end(); kt++) { + k = kt->first; + 
if (phi[i][k]>0) { + (tdhmm.getB())[i][k] = phi[i][k]/gamma[i]; + + if (isnan((tdhmm.getB())[i][k])) { + wcerr< ambg_class_tags, tags, pretags; + set ::iterator itag, jtag; + + double prob, loli, x; + int N = tdhmm.getN(); +#ifdef __GNUC__ + double alpha[2][N]; + vector best[2][N]; +#else + vector > alpha(2, vector(N)); + vector > > best(2, vector >(N)); +#endif + + vector wpend; + int nwpend; + + MorphoStream morpho_stream(Input, debug, &tdhmm); + morpho_stream.setNullFlush(null_flush); + + Collection &output = tdhmm.getOutput(); + + loli = nw = 0; + + //Initialization + tags.insert(eos); + alpha[0][eos] = 1; + + word = morpho_stream.get_next_word(); + + while (word) { + wpend.push_back(*word); + nwpend = wpend.size(); + + pretags = tags; // Tags from the previous word + + tags = word->get_tags(); + + if (tags.size()==0) // This is an unknown word + tags = tdhmm.getOpenClass(); + + ambg_class_tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); + + k = output[ambg_class_tags]; //Ambiguity class the word belongs to + +#ifdef __GNUC__ + clear_array_double(alpha[nwpend%2], N); + clear_array_vector(best[nwpend%2], N); +#else + clear_array_double(&alpha[nwpend%2][0], N); + clear_array_vector(&best[nwpend%2][0], N); +#endif + + //Induction + for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word + i=*itag; + for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) { //For all tags from the previous word + j=*jtag; + x = alpha[1-nwpend%2][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k]; + if (alpha[nwpend%2][i]<=x) { + if (nwpend>1) + best[nwpend%2][i] = best[1-nwpend%2][j]; + best[nwpend%2][i].push_back(i); + alpha[nwpend%2][i] = x; + } + } + } + + //Backtracking + if (tags.size()==1) { + tag = *tags.begin(); + + prob = alpha[nwpend%2][tag]; + + if (prob>0) + loli -= log(prob); + else { + if (debug) + wcerr<get_superficial_form()<get_string_tags()<1)&&(debug)) { + wstring errors; + errors = L"The text to disambiguate has 
finished, but there are ambiguous words that has not been disambiguated.\n"; + errors+= L"This message should never appears. If you are reading this ..... these are very bad news.\n"; + wcerr< ambiguity_class; + set::iterator itag; + cout<<"AMBIGUITY CLASSES\n-------------------------------\n"; + for(int i=0; i != tdhmm.getM(); i++) { + ambiguity_class = (tdhmm.getOutput())[i]; + cout < "/dev/stderr" + guesswarned=1 + } + if(seen[filename]) { + print "apertium-createmodes.awk: "filename" seen twice" > "/dev/stderr" + filename = 0 + } + else { + print "" > filename + seen[filename] = 1 + } + next +} + +filename { + print $0 >> filename + close(filename) +} Property changes on: branches/apertium-tagger/apertium2/apertium/apertium-createmodes.awk ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/apertium-tagger/apertium2/apertium/modes.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes.dtd (revision 69632) @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/modes.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes.rnc (revision 69632) @@ -0,0 +1,33 @@ +# Copyright (C) 2005-2016 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# DTD for the modes.xml file + +modes = element modes { attlist.modes, mode+ } +attlist.modes &= empty +mode = element mode { attlist.mode, pipeline } +attlist.mode &= attribute name { xsd:ID } +attlist.mode &= attribute install { text }? +attlist.mode &= attribute gendebug { text }? +pipeline = element pipeline { attlist.pipeline, program+ } +attlist.pipeline &= empty +program = element program { attlist.program, (file | arg)* } +attlist.program &= attribute name { text } +attlist.program &= attribute debug-suff { text }? +file = element file { attlist.file, empty } +attlist.file &= attribute name { text } +arg = element arg { attlist.arg, empty } +attlist.arg &= attribute name { text } +start = modes Index: branches/apertium-tagger/apertium2/apertium/modes.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes.rng (revision 69632) @@ -0,0 +1,106 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/modes2bash.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes2bash.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes2bash.xsl (revision 69632) @@ -0,0 +1,99 @@ + + + + + + + + + + + + + + + + + + + +# + + .mode + + + + + + + + ' + + ' + + +# modes/ + + .mode + + + + + + + + + + + + | + + + + + + + + 
+ + + + + + + + + + + + + + + + + ' + + / + + ' + + + + Index: branches/apertium-tagger/apertium2/apertium/modes2debugmodes.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes2debugmodes.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes2debugmodes.xsl (revision 69632) @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + **************** + + : **************** + + + + + + + + + + + + + - + + + -disam + + + -tagger + + + -pretransfer + + + -lex + + + -chunker + + + -interchunk + + + -postchunk + + + -dgen + + + -biltrans + + + -pgen + + + -morph + + + -morph + + + -NAMEME + + + + + + + + + + -t + + + -t + + + -t + + + -t + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/apertium_gen_wlist_lextor_translation.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_gen_wlist_lextor_translation.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_gen_wlist_lextor_translation.cc (revision 69632) @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include "getopt_long.h" +#include + +#include + +#include +#include +#include + +using namespace Apertium; +using namespace std; + + +void help(char *name) { + wcerr<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + if(monodic_file=="") { + wcerr<\n"; + for (int i=0; i. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "getopt_long.h" + +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +void message(char *progname) +{ + cerr << "USAGE: " << basename(progname) << " [-tz] t2x preproc [input [output]]" << endl; + cerr << " t2x t2x rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << "OPTIONS" <. + */ + +#include +#include +#include "getopt_long.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace Apertium; + +#define MODE_TRAINWRD 0 +#define MODE_TRAINLCH 1 +#define MODE_LEXTOR 2 +#define MODE_LEXTORTL 3 + +using namespace std; + + +void help(char *name) { + cerr<<"USAGE:\n"; + cerr<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + if (weight_exponent<0) { + wcerr<. + */ + +#include +#include +#include "getopt_long.h" + +#include + +#include +#include +#include +#include +#include +#include + +using namespace Apertium; +#define MODE_LEXTOR 1 +#define MODE_LEXTORTL 2 + +using namespace std; + + +void help(char *name) { + cerr<<"USAGE:\n"; + cerr<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + cerr<<"TH ANGLE: "<. 
+ */ +#include +#include + +#include +#include "getopt_long.h" +#include +#include +#include +#include +#include +#include +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +void message(char *progname) +{ + cerr << "USAGE: " << basename(progname) << " [-z] t3x preproc [input [output]]" << endl; + cerr << " t3x t3x rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << "OPTIONS" <. + */ +#include +#include +#include +#include +#include +#include "getopt_long.h" + +#include +#include "apertium_config.h" +#include + +#ifdef _MSC_VER +#include +#include +#endif +#include + +using namespace Apertium; +using namespace std; + +
// Set by -e/--compounds. When true, a '~' found outside a tag while
// buffering is turned into a lexical-unit boundary ("$^"); when false that
// '~' is dropped.
bool compound_sep = false;

// True once no further character can be obtained from the stream.  fgetwc()
// also returns WEOF on a read or decoding error, in which case feof() stays
// false; testing only feof() would let the callers loop on WEOF.
static bool inputExhausted(FILE *input)
{
  return feof(input) || ferror(input);
}

// Report a truncated stream and terminate the process.
static void unexpectedEOF()
{
  wcerr << L"ERROR: Unexpected EOF" << endl;
  exit(EXIT_FAILURE);
}

/**
 * Copy characters from input to output until charcode is read.
 *
 * The terminating character is consumed but not written.  A backslash is
 * copied together with the character that follows it, so an escaped
 * terminator does not end the copy.  Exits with EXIT_FAILURE if the input
 * ends before charcode is seen.
 */
void readAndWriteUntil(FILE *input, FILE *output, int const charcode)
{
  int mychar;

  while((mychar = fgetwc_unlocked(input)) != charcode)
  {
    if(inputExhausted(input))
    {
      unexpectedEOF();
    }
    fputwc_unlocked(mychar, output);
    if(mychar == L'\\')
    {
      // Copy the escaped character verbatim, but never write WEOF.
      mychar = fgetwc_unlocked(input);
      if(inputExhausted(input))
      {
        unexpectedEOF();
      }
      fputwc_unlocked(mychar, output);
    }
  }
}

/**
 * Process one lexical unit; the opening '^' has already been consumed and
 * written by the caller, and the caller writes the closing '$'.
 *
 * Characters are echoed to output until the first '<'.  From there on they
 * are collected in a buffer, where a '+' outside a tag becomes "$ ^" and a
 * '~' outside a tag becomes "$^" (only when compound_sep is set, otherwise
 * it is dropped).  A '#' seen while buffering switches back to echoing
 * ("queue" mode); a later '+' in that mode appends "$ ^" to the buffer and
 * resumes buffering.  The buffer is written after the echoed text.
 *
 * @param surface_forms when true, everything up to and including the first
 *                      '/' is discarded before processing starts.
 */
void procWord(FILE *input, FILE *output, bool surface_forms)
{
  int mychar;
  wstring buffer = L"";

  bool buffer_mode = false;
  bool in_tag = false;
  bool queuing = false;

  if(surface_forms)
  {
    // Skip the surface form.  Without the EOF test this loop never ends on
    // truncated input, because WEOF never compares equal to '/'.
    while((mychar = fgetwc_unlocked(input)) != L'/')
    {
      if(inputExhausted(input))
      {
        unexpectedEOF();
      }
    }
  }

  while((mychar = fgetwc_unlocked(input)) != L'$')
  {
    if(inputExhausted(input))
    {
      unexpectedEOF();
    }

    switch(mychar)
    {
      case L'<':
        in_tag = true;
        if(!buffer_mode)
        {
          buffer_mode = true;
        }
        break;

      case L'>':
        in_tag = false;
        break;

      case L'#':
        if(buffer_mode)
        {
          buffer_mode = false;
          queuing = true;
        }
        break;
    }

    if(buffer_mode)
    {
      bool const plain_plus = (mychar == L'+' && !in_tag);
      bool const plain_tilde = (mychar == L'~' && !in_tag);

      if(!plain_plus && !plain_tilde)
      {
        buffer += static_cast<wchar_t>(mychar);
      }
      else if(plain_plus)
      {
        buffer.append(L"$ ^");
      }
      else if(compound_sep)
      {
        // plain_tilde with compound splitting enabled
        buffer.append(L"$^");
      }
      // plain_tilde without compound splitting: the character is dropped.
    }
    else
    {
      if(mychar == L'+' && queuing)
      {
        buffer.append(L"$ ^");
        buffer_mode = true;
      }
      else
      {
        fputwc_unlocked(mychar, output);
      }
    }
  }
  fputws_unlocked(buffer.c_str(), output);
}

/**
 * Copy the whole stream from input to output, rewriting lexical units.
 *
 * "[...]" blocks and backslash-escaped characters are copied unchanged,
 * "^...$" units are handed to procWord(), and a NUL character is copied and,
 * when null_flush is set, followed by a flush of output.
 */
void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms)
{
  while(true)
  {
    int mychar = fgetwc_unlocked(input);
    if(inputExhausted(input))
    {
      break;
    }
    switch(mychar)
    {
      case L'[':
        fputwc_unlocked(L'[', output);
        readAndWriteUntil(input, output, L']');
        fputwc_unlocked(L']', output);
        break;

      case L'\\':
        fputwc_unlocked(mychar, output);
        mychar = fgetwc_unlocked(input);
        if(inputExhausted(input))
        {
          // Trailing backslash at end of input: nothing left to copy.
          return;
        }
        fputwc_unlocked(mychar, output);
        break;

      case L'^':
        fputwc_unlocked(mychar, output);
        procWord(input, output, surface_forms);
        fputwc_unlocked(L'$', output);
        break;

      case L'\0':
        fputwc_unlocked(mychar, output);

        if(null_flush)
        {
          fflush(output);
        }
        break;

      default:
        fputwc_unlocked(mychar, output);
        break;
    }
  }
}

/**
 * Print the synopsis to standard error and exit with EXIT_FAILURE.
 */
void usage(char *progname)
{
  wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl;
  exit(EXIT_FAILURE);
}

/**
 * Entry point: parse -z/-e/-n/-h, open the optional input and output files
 * and run processStream() on them.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();
  bool null_flush = false;
  bool surface_forms = false;

  int option_index = 0;

  static struct option long_options[] =
    {
      {"null-flush", no_argument, 0, 'z'},
      {"no-surface-forms", no_argument, 0, 'n'},
      {"compounds", no_argument, 0, 'e'},
      {"help", no_argument, 0, 'h'},
      {0, 0, 0, 0}
    };

  while(true)
  {
    int const c = getopt_long(argc, argv, "enzh", long_options, &option_index);
    if(c == -1)
    {
      break;
    }

    switch(c)
    {
      case 'z':
        null_flush = true;
        break;

      case 'e':
        compound_sep = true;
        break;

      case 'n':
        surface_forms = true;
        break;

      case 'h':
      default:
        usage(argv[0]);
        break;
    }
  }

  // After option parsing the positional arguments are argv[optind..argc-1]:
  // [input_file [output_file]].
  int const positional = argc - optind;
  if(positional > 2)
  {
    usage(argv[0]);
  }

  FILE *input = stdin;
  FILE *output = stdout;
  char const *input_name = "standard input";

  if(positional >= 1)
  {
    input_name = argv[optind];
    input = fopen(input_name, "r");
    if(!input)
    {
      usage(argv[0]);
    }
  }

  // The output file is opened (and truncated) only once the input is known
  // to be readable.
  if(positional == 2)
  {
    output = fopen(argv[optind + 1], "w");
    if(!output)
    {
      usage(argv[0]);
    }
  }

  if(feof(input))
  {
    // Name the stream that was actually opened; argv[1] may be an option or
    // may not exist at all.
    wcerr << L"ERROR: Can't read file '" << input_name << L"'" << endl;
    exit(EXIT_FAILURE);
  }

#ifdef _MSC_VER
  _setmode(_fileno(input), _O_U8TEXT);
  _setmode(_fileno(output), _O_U8TEXT);
#endif

  processStream(input, output, null_flush, surface_forms);
  return EXIT_SUCCESS;
}
Index: branches/apertium-tagger/apertium2/apertium/apertium_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_tagger.h (revision 69632) @@ -0,0 +1,88 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see .
+ +#ifndef APERTIUM_TAGGER_H +#define APERTIUM_TAGGER_H + +#include "apertium_config.h" + +#include "basic_stream_tagger.h" +#include "basic_stream_tagger_trainer.h" +#include "basic_tagger.h" +#include "constructor_eq_delete.h" +#include "file_tagger.h" +#include "optional.h" + +#include "getopt_long.h" +#include + +namespace Apertium { +class apertium_tagger : private constructor_eq_delete { +public: + apertium_tagger(int &argc, char **&argv); + +private: + enum FunctionTypeType { Unigram, SlidingWindow }; + enum UnigramType { Stream_5_3_1, Stream_5_3_2, Stream_5_3_3 }; + enum FunctionType { Tagger, Retrain, Supervised, Train }; + static void help(); + + + static std::string option_string(const int &indexptr_); + static std::string option_string(const struct option &option_); + + + static void locale_global_(); + + + static const struct option longopts[]; + + + + void set_indexptr(); + + + void flagOptionCase(bool (basic_Tagger::Flags::*GetFlag)() const, + void (basic_Tagger::Flags::*SetFlag)(const bool &)); + std::string option_string(); + void functionTypeTypeOptionCase(const FunctionTypeType &FunctionTypeType_); + void functionTypeOptionCase(const FunctionType &FunctionType_); + void getIterationsArgument(); + unsigned long optarg_unsigned_long() const; + void g_StreamTagger(basic_StreamTagger &StreamTagger_); + void s_StreamTaggerTrainer(basic_StreamTaggerTrainer &StreamTaggerTrainer_); + void g_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + void r_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + void s_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + void t_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + int &argc; + char **&argv; + int The_val; + + + int The_indexptr; + Optional FunctionTypeTypeOption_indexptr; + Optional FunctionTypeOption_indexptr; + + + Optional TheFunctionTypeType; + Optional TheUnigramType; + Optional TheFunctionType; + unsigned long TheFunctionTypeOptionArgument; + basic_Tagger::Flags TheFlags; +}; +} + +#endif // APERTIUM_TAGGER_H Index: 
branches/apertium-tagger/apertium2/apertium/apertium_tagger_apply_new_rules.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_tagger_apply_new_rules.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_tagger_apply_new_rules.cc (revision 69632) @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2004-2006 Felipe Sánchez-Martínez + * Copyright (C) 2006 Universitat d'Alacant + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include "getopt_long.h" + +#include +#include +#include +#include + +using namespace Apertium; + +using namespace std; + +//Global vars +TaggerDataHMM tagger_data_hmm; +TTag eos; //End-of-sentence tag + +void check_file(FILE *f, const string& path) { + if (!f) { + cerr<<"Error: cannot open file '"<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + //Now we check the command line arguments + if (filein=="") { + cerr<<"Error: You did not provide an input file (.prob). Use --filein to do that\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if (fileout=="") { + cerr<<"Error: You did not provide an output file (.prob). Use --fileout to do that\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if (filetsx=="") { + cerr<<"Error: You did not provide a tagger definition file (.tsx). 
Use --filetsx to do that\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + FILE *fin, *fout; + + fin=fopen(filein.c_str(), "rb"); + check_file(fin, filein); + + cerr<<"Reading apertium-tagger data from file '"<. + */ + +/* +#include +#include +#include +#include +#include +*/ + +#include "getopt_long.h" +#include +#include +#include +#include +#include +#include + +#include +#include + + +using namespace std; + +//Global vars +TaggerDataHMM tagger_data_hmm; +bool check_ambclasses; + +void check_file(FILE *f, const string& path) { + if (!f) { + cerr<<"Error: cannot open file '"<get_superficial_form())<<" "<get_string_tags())<<"\n"; + + if (check_ambclasses) { + int k=tagger_data_hmm.getOutput()[word->get_tags()]; + + if ((k>=tagger_data_hmm.getM())||(k<0)) { + cerr<<"Error: Ambiguity class number out of range: "<get_superficial_form())<<"\n"; + cerr<<"Ambiguity class: "<get_string_tags())<<"\n"; + } + } + + delete word; + + if ((corpus_length>0) && (nwords>=corpus_length)) + break; + + word=lexmorfo.get_next_word(); + } + cerr<] < file.crp \n\n"; + + cerr<<"ARGUMENTS: \n" + <<" --tsxfile|-x: Specify a tagger specification file\n" + <<" --probfile|-p: Specify a tagger parameter file\n" + <<" --clength|-l: Specify the length of the corpus to process\n"; +} + + +int main(int argc, char* argv[]) { + string tsxfile=""; + string probfile=""; + int corpus_length=-1; + + int c; + int option_index=0; + + cerr<<"LOCALE: "<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + if((tsxfile=="") && (probfile=="")) { + cerr<<"Error: You have provided neither a tagger specification file (.tsx) nor a tagger probability file (.prob). Use --tsxfile or --probfile to provide one of them\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if((tsxfile!="") && (probfile!="")) { + cerr<<"Error: You provided a tagger specification file and a tagger probability file. 
Only one of them can be provided, not both\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if (tsxfile!="") { + cerr<<"Reading tagger specification from file '"<. + */ +#include +#include "getopt_long.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; +using namespace std; + +void usage(char *progname) +{ + wcerr << L"USAGE: " << basename(progname) << L" [options] code1 code2 doc1 doc2 [output_file]" << endl; + wcerr << L"Options:" << endl; + wcerr << L" -p percent number 0 < n <= 1 to set margin of confidence of TU's " << endl; + wcerr << L" (0.85 by default) in length terms" << endl; + wcerr << L" -e edit number 0 < n <= 1 to set margin of confidence of TU's " << endl; + wcerr << L" (0.30 by default) in edit distance terms" << endl; + wcerr << L" -l low-limit ignore percent if the segment is less than lowlimit" < 1) + { + usage(argv[0]); + } + break; + case 'e': + edit_distance_percent = strtod(optarg, NULL); + if(edit_distance_percent <= 0 || edit_distance_percent > 1) + { + usage(argv[0]); + } + break; + + case 'l': + low_limit = atoi(optarg); + if(low_limit < 0) + { + usage(argv[0]); + } + break; + + case 'm': + max_edit = atoi(optarg); + if(max_edit < 0) + { + usage(argv[0]); + } + break; + + case 'd': + diagonal_width = atoi(optarg); + if(diagonal_width < 0) + { + usage(argv[0]); + } + break; + + case 'w': + window_size = atoi(optarg); + if(window_size < 0) + { + usage(argv[0]); + } + break; + + case 's': + step = atoi(optarg); + if(step < 0) + { + usage(argv[0]); + } + break; + + case 't': + translation = optarg; + break; + + + default: + //wcerr<. 
+ */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "getopt_long.h" +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +void message(char *progname) +{ + cerr << "USAGE: " << basename(progname) << " trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -b trules preproc [input [output]]" << endl; + cerr << " " << basename(progname) << " -n trules preproc [input [output]]" << endl; + cerr << " " << basename(progname) << " -x extended trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -c trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -t trules preproc biltrans [input [output]]" << endl; + cerr << " trules transfer rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " biltrans bilingual letter transducer file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << " -b input from lexical transfer" << endl; + cerr << " -n don't use bilingual dictionary" << endl; + cerr << " -x bindix extended mode with user dictionary" << endl; + cerr << " -c case-sensitiveness while accessing bilingual dictionary" << endl; + cerr << " -t trace (show rule numbers and patterns matched)" << endl; + cerr << " -T trace, for apertium-transfer-tools (also sets -t)" << endl; + cerr << " -z null-flushing output on '\0'" << endl; + cerr << " -h shows this message" << endl; + + + exit(EXIT_FAILURE); +} + +void testfile(string const &filename) +{ + struct stat mybuf; + if(stat(filename.c_str(), &mybuf) == -1) + { + cerr << "Error: can't stat file '"; + cerr << filename << "'." 
<< endl; + exit(EXIT_FAILURE); + } +} + +FILE * open_input(string const &filename) +{ + FILE *input = fopen(filename.c_str(), "r"); + if(!input) + { + cerr << "Error: can't open input file '"; + cerr << filename.c_str() << "'." << endl; + exit(EXIT_FAILURE); + } + + return input; +} + +FILE * open_output(string const &filename) +{ + FILE *output = fopen(filename.c_str(), "w"); + if(!output) + { + cerr << "Error: can't open output file '"; + cerr << filename.c_str() << "'." << endl; + exit(EXIT_FAILURE); + } + return output; +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + + Transfer t; + + int option_index=0; + + while (true) { + static struct option long_options[] = + { + {"from-bilingual", no_argument, 0, 'b'}, + {"no-bilingual", no_argument, 0, 'n'}, + {"extended", required_argument, 0, 'x'}, + {"case-sensitive", no_argument, 0, 'c'}, + {"null-flush", no_argument, 0, 'z'}, + {"trace", no_argument, 0, 't'}, + {"trace_att", no_argument, 0, 'T'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int c=getopt_long(argc, argv, "nbx:cztTh", long_options, &option_index); + if (c==-1) + break; + + switch (c) + { + case 'b': + t.setPreBilingual(true); + t.setUseBilingual(false); + break; + + case 'n': + t.setUseBilingual(false); + break; + + case 'x': + t.setExtendedDictionary(optarg); + break; + + case 'c': + t.setCaseSensitiveness(true); + break; + + case 't': + t.setTrace(true); + break; + + case 'T': + t.setTrace(true); + t.setTraceATT(true); + break; + + case 'z': + t.setNullFlush(true); + break; + + case 'h': + default: + message(argv[0]); + break; + } + } + + FILE *input = stdin, *output = stdout; + + switch(argc - optind + 1) + { + case 6: + output = open_output(argv[argc-1]); + input = open_input(argv[argc-2]); + testfile(argv[argc-3]); + testfile(argv[argc-4]); + testfile(argv[argc-5]); + t.read(argv[argc-5], argv[argc-4], argv[argc-3]); + break; + + case 5: + if(t.getUseBilingual() == false || t.getPreBilingual() == true) + { + 
output = open_output(argv[argc-1]); + input = open_input(argv[argc-2]); + testfile(argv[argc-3]); + testfile(argv[argc-4]); + t.read(argv[argc-4], argv[argc-3]); + } + else + { + input = open_input(argv[argc-1]); + testfile(argv[argc-2]); + testfile(argv[argc-3]); + testfile(argv[argc-4]); + t.read(argv[argc-4], argv[argc-3], argv[argc-2]); + } + break; + + case 4: + if(t.getUseBilingual() == false || t.getPreBilingual() == true) + { + input = open_input(argv[argc-1]); + testfile(argv[argc-2]); + testfile(argv[argc-3]); + t.read(argv[argc-3], argv[argc-2]); + } + else + { + testfile(argv[argc-1]); + testfile(argv[argc-2]); + testfile(argv[argc-3]); + t.read(argv[argc-3], argv[argc-2], argv[argc-1]); + } + break; + case 3: + if(t.getUseBilingual() == false || t.getPreBilingual() == true) + { + testfile(argv[argc-1]); + testfile(argv[argc-2]); + t.read(argv[argc-2], argv[argc-1]); + } + else + { + message(argv[0]); + } + break; + + default: + message(argv[0]); + break; + } + +#ifdef _MSC_VER + _setmode(_fileno(input), _O_U8TEXT); + _setmode(_fileno(output), _O_U8TEXT); +#endif + + t.transfer(input, output); + return EXIT_SUCCESS; +} Index: branches/apertium-tagger/apertium2/apertium/getopt_long.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/getopt_long.c (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/getopt_long.c (revision 69632) @@ -0,0 +1,1236 @@ +/* + * THIS IS NOT A CLEAN COPY OF GETOPT.C AND GETOPT1.C + * + * Implementation of getopt_long, cobbled together from getopt.c and + * getopt1.c from the GNU binutils distribution. This is more-or-less + * getopt.c inserted into getopt1.c, with the definition of getopt() + * commented out. + * + * Need to ifdef out optarg, optind, opterr, optopt, to handle the + * case where these are already defined for the benefit of system + * getopt() + * + * No, it's not pretty. 
+ */ + +/* getopt_long and getopt_long_only entry points for GNU getopt. + Copyright (C) 1987,88,89,90,91,92,93,94,96,97,98 + Free Software Foundation, Inc. + + NOTE: This source is derived from an old version taken from the GNU C + Library (glibc). + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + USA. */ + +#include + +#ifndef HAVE_GETOPT_LONG +/* We shouldn't be compiling this module in this case, but we clearly + are (damned configuration tools!), so avoid messing up. */ + +#include "getopt_long.h" +/* See getopt_long.h for discussion of THIS_IS__STDC__ */ + + +#if !defined THIS_IS__STDC__ || !THIS_IS__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +#ifndef const +#define const +#endif +#endif + +#include + + + +/* ******************** getopt.c ******************** */ +/* Getopt for GNU. + NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + + Copyright (C) 1987, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98 + Free Software Foundation, Inc. + + NOTE: This source is derived from an old version taken from the GNU C + Library (glibc). 
+ + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in . + Ditto for AIX 3.2 and . */ +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + + +#if !defined THIS_IS__STDC__ || !THIS_IS__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. 
*/ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. + When compiling libc, the _ macro is predefined. */ +# ifdef HAVE_LIBINTL_H +# include +# define _(msgid) gettext (msgid) +# else +# define _(msgid) (msgid) +# endif +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + + + +/* Define HAVE_GETOPT if the getopt function (and thus, which is more + * important to us, the getopt globals, optarg, optind, opterr and + * optopt) is defined by the system. Leave undefined if they should be + * defined here instead. + */ +#ifndef HAVE_GETOPT + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +char *optarg = NULL; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. 
+ + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +#endif /* #ifndef HAVE_GETOPT */ + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. */ + +int __getopt_initialized = 0; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. 
+ + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. + Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# if HAVE_STRINGS_H +# include +# endif +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif + +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. */ +# if (!defined THIS_IS__STDC__ || !THIS_IS__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. 
*/ +extern int strlen (const char *); +# endif /* not THIS_IS__STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. */ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; + +static int original_argc; +static char *const *original_argv; + +/* Make sure the environment variable bash 2.0 puts in the environment + is valid for the getopt call we must make sure that the ARGV passed + to getopt is that one passed to the process. */ +static void +__attribute__ ((unused)) +store_args_and_env (int argc, char *const *argv) +{ + /* XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). */ + original_argc = argc; + original_argv = argv; +} +# ifdef text_set_element +text_set_element (__libc_subinit, store_args_and_env); +# endif /* text_set_element */ + +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. 
+ + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. */ + +#if defined THIS_IS__STDC__ && THIS_IS__STDC__ +static void exchange (char **); +#endif + +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + +#ifdef _LIBC + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. The user plays games with us and + presents new arguments. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. 
*/ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. */ + +#if defined THIS_IS__STDC__ && THIS_IS__STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. */ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. 
*/ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + +#ifdef _LIBC + if (posixly_correct == NULL + && argc == original_argc && argv == original_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. + + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) 
+ + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else in the next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, the value of the option's `val' field + if the `flag' field is zero. + + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPTS of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call. + + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options.
*/ + +#if 0 +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int *longind; + int long_only; +#endif +int +_getopt_internal (int argc, + char *const *argv, + const char *optstring, + const struct option *longopts, + int *longind, + int long_only) +{ + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#ifdef _LIBC +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). */ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. 
*/ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. + Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. + + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. 
*/ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. */ + ambig = 1; + } + + if (ambig && !exact) + { + if (opterr) + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (opterr) + { + if (argv[optind - 1][1] == '-') + /* --option */ + fprintf (stderr, + _("%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); + else + /* +option or -option */ + fprintf (stderr, + _("%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (opterr) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (opterr) + { + if (argv[optind][1] == '-') + /* --option */ + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); + else + /* +option or -option */ + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. */ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. */ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (opterr) + { + if (posixly_correct) + /* 1003.2 specifies the format of this message. 
*/ + fprintf (stderr, _("%s: illegal option -- %c\n"), + argv[0], c); + else + fprintf (stderr, _("%s: invalid option -- %c\n"), + argv[0], c); + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (opterr) + { + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. */ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. 
*/ + ambig = 1; + } + if (ambig && !exact) + { + if (opterr) + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. */ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (opterr) + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (opterr) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (opterr) + { + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. 
*/ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +/* +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int *) 0, + 0); +} +*/ + +#endif /* Not ELIDE_CODE. */ +/* ******************** ...getopt.c ******************** */ + + + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +#include +#if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +#define ELIDE_CODE +#endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. */ +#ifdef __GNU_LIBRARY__ +#include +#endif + +#ifndef NULL +#define NULL 0 +#endif + +/* K&R declarations!? C'mon... */ +/* Just say no to all this gymnastics */ +#if 0 +int +getopt_long (argc, argv, options, long_options, opt_index) + int argc; + char *const *argv; + const char *options; + const struct option *long_options; + int *opt_index; +#endif +int getopt_long (int argc, + char *const *argv, + const char *options, + const struct option *long_options, + int *opt_index) +{ + return _getopt_internal (argc, argv, options, long_options, opt_index, 0); +} + +/* Like getopt_long, but '-' as well as '--' can indicate a long option. + If an option that starts with '-' (not '--') doesn't match a long option, + but does match a short option, it is parsed as a short option + instead. 
*/ + +#if 0 +int +getopt_long_only (argc, argv, options, long_options, opt_index) + int argc; + char *const *argv; + const char *options; + const struct option *long_options; + int *opt_index; +#endif +int +getopt_long_only (int argc, + char *const *argv, + const char *options, + const struct option *long_options, + int *opt_index) +{ + return _getopt_internal (argc, argv, options, long_options, opt_index, 1); +} + + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +#include + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + int option_index = 0; + static struct option long_options[] = + { + {"add", 1, 0, 0}, + {"append", 0, 0, 0}, + {"delete", 1, 0, 0}, + {"verbose", 0, 0, 0}, + {"create", 0, 0, 0}, + {"file", 1, 0, 0}, + {0, 0, 0, 0} + }; + + c = getopt_long (argc, argv, "abc:d:0123456789", + long_options, &option_index); + if (c == -1) + break; + + switch (c) + { + case 0: + printf ("option %s", long_options[option_index].name); + if (optarg) + printf (" with arg %s", optarg); + printf ("\n"); + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case 'd': + printf ("option d with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ + +#endif /* #ifndef HAVE_GETOPT_LONG */ Index: branches/apertium-tagger/apertium2/apertium/getopt_long.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/getopt_long.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/getopt_long.h (revision 69632) @@ -0,0 +1,175 @@ +/* Declarations for getopt. + Copyright 1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 2000 + Free Software Foundation, Inc. + + NOTE: The canonical source of this file is maintained with the GNU C Library. + Bugs can be reported to bug-glibc@gnu.org. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + USA. */ + +#ifndef _GETOPT_LONG_H +#define _GETOPT_LONG_H 1 + +#include + +#if HAVE_UNISTD_H +/* Declares getopt, if present */ +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* We're building this with a C++ compiler, essentially. Such + compilers are not required to define __STDC__, but the path we + should follow, below, is indeed that marked by __STDC__. 
We don't + want to force a definition of __STDC__ (though that works), because + (a) that feels bad, and (b) some compilers perfectly reasonable + complain bitterly about it. So define THIS_IS__STDC__, and replace + occurrences of __STDC__ throughout with that. + + That means that all of the occurrences of THIS_IS__STDC__ in this + file and in getopt_long.c are redundant, but I'm leaving them here + in case it becomes necessary to do cleverer things with it than + simply define it to be 1, and also as a sort of warped documentation. */ +#define THIS_IS__STDC__ 1 + +#if !HAVE_DECL_GETOPT +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#endif /* ifndef HAVE_DECL_GETOPT */ + +#if !HAVE_DECL_GETOPT_LONG +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. 
+ + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. + + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +#if defined (THIS_IS__STDC__) && THIS_IS__STDC__ + const char *name; +#else + char *name; +#endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +#define no_argument 0 +#define required_argument 1 +#define optional_argument 2 + +#endif /* #if !HAVE_DECL_GETOPT_LONG */ + +#if defined (THIS_IS__STDC__) && THIS_IS__STDC__ +/* HAVE_DECL_* is a three-state macro: undefined, 0 or 1. If it is + undefined, we haven't run the autoconf check so provide the + declaration without arguments. If it is 0, we checked and failed + to find the declaration so provide a fully prototyped one. If it + is 1, we found it so don't provide any declaration at all. */ +#if defined (__GNU_LIBRARY__) || (defined (HAVE_DECL_GETOPT) && !HAVE_DECL_GETOPT) +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. 
*/ +extern int getopt (int argc, char *const *argv, const char *shortopts); +#else /* not __GNU_LIBRARY__ */ +# if !defined (HAVE_DECL_GETOPT) +extern int getopt (); +# endif +#endif /* __GNU_LIBRARY__ */ +#if !HAVE_DECL_GETOPT_LONG +extern int getopt_long (int argc, char *const *argv, const char *shortopts, + const struct option *longopts, int *longind); +extern int getopt_long_only (int argc, char *const *argv, + const char *shortopts, + const struct option *longopts, int *longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int argc, char *const *argv, + const char *shortopts, + const struct option *longopts, int *longind, + int long_only); +#endif /* HAVE_DECL_GETOPT_LONG */ +#else /* not THIS_IS__STDC__ */ +#if !HAVE_DECL_GETOPT +extern int getopt (); +#endif /* HAVE_DECL_GETOPT */ +#if !HAVE_DECL_GETOPT_LONG +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +#endif /* HAVE_DECL_GETOPT_LONG */ +#endif /* THIS_IS__STDC__ */ + + +#ifdef __cplusplus +} +#endif + +#endif /* getopt.h */ Index: branches/apertium-tagger/apertium2/apertium/win32/unistd.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/unistd.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/unistd.h (revision 69632) @@ -0,0 +1,13 @@ +// This should really be defined elsewhere +#define YY_INPUT(buf,result,max_size) \ + if ( (result = fread( (char *) buf, 1, max_size, yyin )) < 0 ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); + +#define fileno _fileno + +#if defined(_WIN32) && defined(isatty) +#undef isatty +#define isatty _isatty +#endif + +#define unlink _unlink Index: branches/apertium-tagger/apertium2/apertium/win32/snprintf.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/snprintf.c (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/win32/snprintf.c (revision 69632) @@ -0,0 +1,1025 @@ +/* + * snprintf.c - a portable implementation of snprintf + * + * AUTHOR + * Mark Martinec , April 1999. + * + * Copyright 1999, Mark Martinec. All rights reserved. + * + * TERMS AND CONDITIONS + * This program is free software; you can redistribute it and/or modify + * it under the terms of the "Frontier Artistic License" which comes + * with this Kit. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the Frontier Artistic License for more details. + * + * You should have received a copy of the Frontier Artistic License + * with this Kit in the file named LICENSE.txt . + * If not, I'll be glad to provide one. + * + * FEATURES + * - careful adherence to specs regarding flags, field width and precision; + * - good performance for large string handling (large format, large + * argument or large paddings). Performance is similar to system's sprintf + * and in several cases significantly better (make sure you compile with + * optimizations turned on, tell the compiler the code is strict ANSI + * if necessary to give it more freedom for optimizations); + * - return value semantics per ISO/IEC 9899:1999 ("ISO C99"); + * - written in standard ISO/ANSI C - requires an ANSI C compiler. + * + * SUPPORTED CONVERSION SPECIFIERS AND DATA TYPES + * + * This snprintf only supports the following conversion specifiers: + * s, c, d, u, o, x, X, p (and synonyms: i, D, U, O - see below) + * with flags: '-', '+', ' ', '0' and '#'. + * An asterisk is supported for field width as well as precision. + * + * Length modifiers 'h' (short int), 'l' (long int), + * and 'll' (long long int) are supported. 
+ * NOTE: + * If macro SNPRINTF_LONGLONG_SUPPORT is not defined (default) the + * length modifier 'll' is recognized but treated the same as 'l', + * which may cause argument value truncation! Defining + * SNPRINTF_LONGLONG_SUPPORT requires that your system's sprintf also + * handles length modifier 'll'. long long int is a language extension + * which may not be portable. + * + * Conversion of numeric data (conversion specifiers d, u, o, x, X, p) + * with length modifiers (none or h, l, ll) is left to the system routine + * sprintf, but all handling of flags, field width and precision as well as + * c and s conversions is done very carefully by this portable routine. + * If a string precision (truncation) is specified (e.g. %.8s) it is + * guaranteed the string beyond the specified precision will not be referenced. + * + * Length modifiers h, l and ll are ignored for c and s conversions (data + * types wint_t and wchar_t are not supported). + * + * The following common synonyms for conversion characters are supported: + * - i is a synonym for d + * - D is a synonym for ld, explicit length modifiers are ignored + * - U is a synonym for lu, explicit length modifiers are ignored + * - O is a synonym for lo, explicit length modifiers are ignored + * The D, O and U conversion characters are nonstandard, they are supported + * for backward compatibility only, and should not be used for new code. 
+ * + * The following is specifically NOT supported: + * - flag ' (thousands' grouping character) is recognized but ignored + * - numeric conversion specifiers: f, e, E, g, G and synonym F, + * as well as the new a and A conversion specifiers + * - length modifier 'L' (long double) and 'q' (quad - use 'll' instead) + * - wide character/string conversions: lc, ls, and nonstandard + * synonyms C and S + * - writeback of converted string length: conversion character n + * - the n$ specification for direct reference to n-th argument + * - locales + * + * It is permitted for str_m to be zero, and it is permitted to specify NULL + * pointer for resulting string argument if str_m is zero (as per ISO C99). + * + * The return value is the number of characters which would be generated + * for the given input, excluding the trailing null. If this value + * is greater than or equal to str_m, not all characters from the result + * have been stored in str, output bytes beyond the (str_m-1) -th character + * are discarded. If str_m is greater than zero it is guaranteed + * the resulting string will be null-terminated. + * + * NOTE that this matches the ISO C99, OpenBSD, and GNU C library 2.1, + * but is different from some older and vendor implementations, + * and is also different from XPG, XSH5, SUSv2 specifications. + * For historical discussion on changes in the semantics and standards + * of snprintf see printf(3) man page in the Linux programmers manual. + * + * Routines asprintf and vasprintf return a pointer (in the ptr argument) + * to a buffer sufficiently large to hold the resulting string. This pointer + * should be passed to free(3) to release the allocated storage when it is + * no longer needed. If sufficient space cannot be allocated, these functions + * will return -1 and set ptr to be a NULL pointer. These two routines are + * GNU C library extensions (glibc).
+ * + * Routines asnprintf and vasnprintf are similar to asprintf and vasprintf, + * yet, like snprintf and vsnprintf counterparts, will write at most str_m-1 + * characters into the allocated output string, the last character in the + * allocated buffer then gets the terminating null. If the formatted string + * length (the return value) is greater than or equal to the str_m argument, + * the resulting string was truncated and some of the formatted characters + * were discarded. These routines present a handy way to limit the amount + * of allocated memory to some sane value. + * + * AVAILABILITY + * http://www.ijs.si/software/snprintf/ + * + * REVISION HISTORY + * 1999-04 V0.9 Mark Martinec + * - initial version, some modifications after comparing printf + * man pages for Digital Unix 4.0, Solaris 2.6 and HPUX 10, + * and checking how Perl handles sprintf (differently!); + * 1999-04-09 V1.0 Mark Martinec + * - added main test program, fixed remaining inconsistencies, + * added optional (long long int) support; + * 1999-04-12 V1.1 Mark Martinec + * - support the 'p' conversion (pointer to void); + * - if a string precision is specified + * make sure the string beyond the specified precision + * will not be referenced (e.g. by strlen); + * 1999-04-13 V1.2 Mark Martinec + * - support synonyms %D=%ld, %U=%lu, %O=%lo; + * - speed up the case of long format string with few conversions; + * 1999-06-30 V1.3 Mark Martinec + * - fixed runaway loop (eventually crashing when str_l wraps + * beyond 2^31) while copying format string without + * conversion specifiers to a buffer that is too short + * (thanks to Edwin Young for + * spotting the problem); + * - added macros PORTABLE_SNPRINTF_VERSION_(MAJOR|MINOR) + * to snprintf.h + * 2000-02-14 V2.0 (never released) Mark Martinec + * - relaxed license terms: The Artistic License now applies. 
+ * You may still apply the GNU GENERAL PUBLIC LICENSE + * as was distributed with previous versions, if you prefer; + * - changed REVISION HISTORY dates to use ISO 8601 date format; + * - added vsnprintf (patch also independently proposed by + * Caolan McNamara 2000-05-04, and Keith M Willenson 2000-06-01) + * 2000-06-27 V2.1 Mark Martinec + * - removed POSIX check for str_m<1; value 0 for str_m is + * allowed by ISO C99 (and GNU C library 2.1) - (pointed out + * on 2000-05-04 by Caolan McNamara, caolan@ csn dot ul dot ie). + * Besides relaxed license this change in standards adherence + * is the main reason to bump up the major version number; + * - added nonstandard routines asnprintf, vasnprintf, asprintf, + * vasprintf that dynamically allocate storage for the + * resulting string; these routines are not compiled by default, + * see comments where NEED_V?ASN?PRINTF macros are defined; + * - autoconf contributed by Caolan McNamara + * 2000-10-06 V2.2 Mark Martinec + * - BUG FIX: the %c conversion used a temporary variable + * that was no longer in scope when referenced, + * possibly causing incorrect resulting character; + * - BUG FIX: make precision and minimal field width unsigned + * to handle huge values (2^31 <= n < 2^32) correctly; + * also be more careful in the use of signed/unsigned/size_t + * internal variables - probably more careful than many + * vendor implementations, but there may still be a case + * where huge values of str_m, precision or minimal field + * could cause incorrect behaviour; + * - use separate variables for signed/unsigned arguments, + * and for short/int, long, and long long argument lengths + * to avoid possible incompatibilities on certain + * computer architectures. 
Also use separate variable + * arg_sign to hold sign of a numeric argument, + * to make code more transparent; + * - some fiddling with zero padding and "0x" to make it + * Linux compatible; + * - systematically use macros fast_memcpy and fast_memset + * instead of case-by-case hand optimization; determine some + * breakeven string lengths for different architectures; + * - terminology change: 'format' -> 'conversion specifier', + * 'C9x' -> 'ISO/IEC 9899:1999 ("ISO C99")', + * 'alternative form' -> 'alternate form', + * 'data type modifier' -> 'length modifier'; + * - several comments rephrased and new ones added; + * - make compiler not complain about 'credits' defined but + * not used; + */ + + +/* Define HAVE_SNPRINTF if your system already has snprintf and vsnprintf. + * + * If HAVE_SNPRINTF is defined this module will not produce code for + * snprintf and vsnprintf, unless PREFER_PORTABLE_SNPRINTF is defined as well, + * causing this portable version of snprintf to be called portable_snprintf + * (and portable_vsnprintf). + */ +/* #define HAVE_SNPRINTF */ + +/* Define PREFER_PORTABLE_SNPRINTF if your system does have snprintf and + * vsnprintf but you would prefer to use the portable routine(s) instead. + * In this case the portable routine is declared as portable_snprintf + * (and portable_vsnprintf) and a macro 'snprintf' (and 'vsnprintf') + * is defined to expand to 'portable_v?snprintf' - see file snprintf.h . + * Defining this macro is only useful if HAVE_SNPRINTF is also defined, + * but does does no harm if defined nevertheless. + */ +/* #define PREFER_PORTABLE_SNPRINTF */ + +/* Define SNPRINTF_LONGLONG_SUPPORT if you want to support + * data type (long long int) and length modifier 'll' (e.g. %lld). + * If undefined, 'll' is recognized but treated as a single 'l'. + * + * If the system's sprintf does not handle 'll' + * the SNPRINTF_LONGLONG_SUPPORT must not be defined! + * + * This is off by default as (long long int) is a language extension. 
+ */ +/* #define SNPRINTF_LONGLONG_SUPPORT */ + +/* Define NEED_SNPRINTF_ONLY if you only need snprintf, and not vsnprintf. + * If NEED_SNPRINTF_ONLY is defined, the snprintf will be defined directly, + * otherwise both snprintf and vsnprintf routines will be defined + * and snprintf will be a simple wrapper around vsnprintf, at the expense + * of an extra procedure call. + */ +/* #define NEED_SNPRINTF_ONLY */ + +/* Define NEED_V?ASN?PRINTF macros if you need library extension + * routines asprintf, vasprintf, asnprintf, vasnprintf respectively, + * and your system library does not provide them. They are all small + * wrapper routines around portable_vsnprintf. Defining any of the four + * NEED_V?ASN?PRINTF macros automatically turns off NEED_SNPRINTF_ONLY + * and turns on PREFER_PORTABLE_SNPRINTF. + * + * Watch for name conflicts with the system library if these routines + * are already present there. + * + * NOTE: vasprintf and vasnprintf routines need va_copy() from stdarg.h, as + * specified by C99, to be able to traverse the same list of arguments twice. + * I don't know of any other standard and portable way of achieving the same. + * With some versions of gcc you may use __va_copy(). You might even get away + * with "ap2 = ap", in this case you must not call va_end(ap2) ! + * #define va_copy(ap2,ap) ap2 = ap + */ +/* #define NEED_ASPRINTF */ +/* #define NEED_ASNPRINTF */ +/* #define NEED_VASPRINTF */ +/* #define NEED_VASNPRINTF */ + + +/* Define the following macros if desired: + * SOLARIS_COMPATIBLE, SOLARIS_BUG_COMPATIBLE, + * HPUX_COMPATIBLE, HPUX_BUG_COMPATIBLE, LINUX_COMPATIBLE, + * DIGITAL_UNIX_COMPATIBLE, DIGITAL_UNIX_BUG_COMPATIBLE, + * PERL_COMPATIBLE, PERL_BUG_COMPATIBLE, + * + * - For portable applications it is best not to rely on peculiarities + * of a given implementation so it may be best not to define any + * of the macros that select compatibility and to avoid features + * that vary among the systems. 
+ * + * - Selecting compatibility with more than one operating system + * is not strictly forbidden but is not recommended. + * + * - 'x'_BUG_COMPATIBLE implies 'x'_COMPATIBLE . + * + * - 'x'_COMPATIBLE refers to (and enables) a behaviour that is + * documented in a sprintf man page on a given operating system + * and actually adhered to by the system's sprintf (but not on + * most other operating systems). It may also refer to and enable + * a behaviour that is declared 'undefined' or 'implementation specific' + * in the man page but a given implementation behaves predictably + * in a certain way. + * + * - 'x'_BUG_COMPATIBLE refers to (and enables) a behaviour of system's sprintf + * that contradicts the sprintf man page on the same operating system. + * + * - I do not claim that the 'x'_COMPATIBLE and 'x'_BUG_COMPATIBLE + * conditionals take into account all idiosyncrasies of a particular + * implementation, there may be other incompatibilities. + */ + + + +/* ============================================= */ +/* NO USER SERVICABLE PARTS FOLLOWING THIS POINT */ +/* ============================================= */ + +#define PORTABLE_SNPRINTF_VERSION_MAJOR 2 +#define PORTABLE_SNPRINTF_VERSION_MINOR 2 + +#if defined(NEED_ASPRINTF) || defined(NEED_ASNPRINTF) || defined(NEED_VASPRINTF) || defined(NEED_VASNPRINTF) +# if defined(NEED_SNPRINTF_ONLY) +# undef NEED_SNPRINTF_ONLY +# endif +# if !defined(PREFER_PORTABLE_SNPRINTF) +# define PREFER_PORTABLE_SNPRINTF +# endif +#endif + +#if defined(SOLARIS_BUG_COMPATIBLE) && !defined(SOLARIS_COMPATIBLE) +#define SOLARIS_COMPATIBLE +#endif + +#if defined(HPUX_BUG_COMPATIBLE) && !defined(HPUX_COMPATIBLE) +#define HPUX_COMPATIBLE +#endif + +#if defined(DIGITAL_UNIX_BUG_COMPATIBLE) && !defined(DIGITAL_UNIX_COMPATIBLE) +#define DIGITAL_UNIX_COMPATIBLE +#endif + +#if defined(PERL_BUG_COMPATIBLE) && !defined(PERL_COMPATIBLE) +#define PERL_COMPATIBLE +#endif + +#if defined(LINUX_BUG_COMPATIBLE) && !defined(LINUX_COMPATIBLE) +#define 
LINUX_COMPATIBLE +#endif + +#include +#include +#include +#include +#include +#include +#include + +#ifdef isdigit +#undef isdigit +#endif +#define isdigit(c) ((c) >= '0' && (c) <= '9') + +/* For copying strings longer or equal to 'breakeven_point' + * it is more efficient to call memcpy() than to do it inline. + * The value depends mostly on the processor architecture, + * but also on the compiler and its optimization capabilities. + * The value is not critical, some small value greater than zero + * will be just fine if you don't care to squeeze every drop + * of performance out of the code. + * + * Small values favor memcpy, large values favor inline code. + */ +#if defined(__alpha__) || defined(__alpha) +# define breakeven_point 2 /* AXP (DEC Alpha) - gcc or cc or egcs */ +#endif +#if defined(__i386__) || defined(__i386) +# define breakeven_point 12 /* Intel Pentium/Linux - gcc 2.96 */ +#endif +#if defined(__hppa) +# define breakeven_point 10 /* HP-PA - gcc */ +#endif +#if defined(__sparc__) || defined(__sparc) +# define breakeven_point 33 /* Sun Sparc 5 - gcc 2.8.1 */ +#endif + +/* some other values of possible interest: */ +/* #define breakeven_point 8 */ /* VAX 4000 - vaxc */ +/* #define breakeven_point 19 */ /* VAX 4000 - gcc 2.7.0 */ + +#ifndef breakeven_point +# define breakeven_point 6 /* some reasonable one-size-fits-all value */ +#endif + +#define fast_memcpy(d,s,n) \ + { register size_t nn = (size_t)(n); \ + if (nn >= breakeven_point) memcpy((d), (s), nn); \ + else if (nn > 0) { /* proc call overhead is worth only for large strings*/\ + register char *dd; register const char *ss; \ + for (ss=(s), dd=(d); nn>0; nn--) *dd++ = *ss++; } } + +#define fast_memset(d,c,n) \ + { register size_t nn = (size_t)(n); \ + if (nn >= breakeven_point) memset((d), (int)(c), nn); \ + else if (nn > 0) { /* proc call overhead is worth only for large strings*/\ + register char *dd; register const int cc=(int)(c); \ + for (dd=(d); nn>0; nn--) *dd++ = cc; } } + +/* prototypes 
/*
 * Allocating printf variants.  Each one first formats into a zero-sized
 * buffer to learn the required length, then allocates storage with malloc()
 * and formats again into it.  On allocation failure *ptr is NULL, errno is
 * set to ENOMEM and -1 is returned; otherwise the return value is the
 * length of the complete formatted string (excluding the terminating null).
 * Each routine is only compiled when its NEED_* macro is defined.
 */

#if defined(NEED_ASPRINTF)
/* asprintf: allocate exactly enough room for the whole formatted string. */
int asprintf(char **ptr, const char *fmt, /*args*/ ...) {
  va_list args;
  int needed, written;
  size_t buf_size;

  *ptr = NULL;

  /* first pass: measure only, nothing is stored */
  va_start(args, fmt);
  needed = portable_vsnprintf(NULL, (size_t)0, fmt, args);
  va_end(args);
  assert(needed >= 0);       /* possible integer overflow if size > INT_MAX */

  buf_size = (size_t)needed + 1;
  *ptr = (char *) malloc(buf_size);
  if (*ptr == NULL) {
    errno = ENOMEM;
    return -1;
  }

  /* second pass: the argument list has to be restarted before reuse */
  va_start(args, fmt);
  written = portable_vsnprintf(*ptr, buf_size, fmt, args);
  va_end(args);
  assert(written == needed);
  (void) written;                       /* only inspected by the assert */
  return needed;
}
#endif

#if defined(NEED_VASPRINTF)
/* vasprintf: as asprintf, but the arguments arrive as a va_list. */
int vasprintf(char **ptr, const char *fmt, va_list ap) {
  va_list probe;
  int needed, written;
  size_t buf_size;

  *ptr = NULL;

  /* measure on a copy so the caller's ap is still intact for the real pass */
  va_copy(probe, ap);
  needed = portable_vsnprintf(NULL, (size_t)0, fmt, probe);
  va_end(probe);
  assert(needed >= 0);       /* possible integer overflow if size > INT_MAX */

  buf_size = (size_t)needed + 1;
  *ptr = (char *) malloc(buf_size);
  if (*ptr == NULL) {
    errno = ENOMEM;
    return -1;
  }

  written = portable_vsnprintf(*ptr, buf_size, fmt, ap);
  assert(written == needed);
  (void) written;                       /* only inspected by the assert */
  return needed;
}
#endif

#if defined(NEED_ASNPRINTF)
/* asnprintf: as asprintf, but never allocates more than str_m bytes, so the
 * stored string may be truncated; str_m == 0 just reports the length. */
int asnprintf (char **ptr, size_t str_m, const char *fmt, /*args*/ ...) {
  va_list args;
  int needed, written;

  *ptr = NULL;

  va_start(args, fmt);                         /* measure the required size */
  needed = portable_vsnprintf(NULL, (size_t)0, fmt, args);
  va_end(args);
  assert(needed >= 0);       /* possible integer overflow if size > INT_MAX */

  /* do not allocate more than the formatted string actually needs */
  if (str_m > (size_t)needed + 1)
    str_m = (size_t)needed + 1;

  /* caller is not interested in the string itself: leave *ptr as NULL */
  if (str_m == 0)
    return needed;

  *ptr = (char *) malloc(str_m);
  if (*ptr == NULL) {
    errno = ENOMEM;
    return -1;
  }

  va_start(args, fmt);
  written = portable_vsnprintf(*ptr, str_m, fmt, args);
  va_end(args);
  assert(written == needed);
  (void) written;                       /* only inspected by the assert */
  return needed;
}
#endif

#if defined(NEED_VASNPRINTF)
/* vasnprintf: as asnprintf, but the arguments arrive as a va_list. */
int vasnprintf (char **ptr, size_t str_m, const char *fmt, va_list ap) {
  va_list probe;
  int needed, written;

  *ptr = NULL;

  /* measure on a copy so the caller's ap is still intact for the real pass */
  va_copy(probe, ap);
  needed = portable_vsnprintf(NULL, (size_t)0, fmt, probe);
  va_end(probe);
  assert(needed >= 0);       /* possible integer overflow if size > INT_MAX */

  /* do not allocate more than the formatted string actually needs */
  if (str_m > (size_t)needed + 1)
    str_m = (size_t)needed + 1;

  /* caller is not interested in the string itself: leave *ptr as NULL */
  if (str_m == 0)
    return needed;

  *ptr = (char *) malloc(str_m);
  if (*ptr == NULL) {
    errno = ENOMEM;
    return -1;
  }

  written = portable_vsnprintf(*ptr, str_m, fmt, ap);
  assert(written == needed);
  (void) written;                       /* only inspected by the assert */
  return needed;
}
#endif
/*
 * If the system does have snprintf and the portable routine is not
 * specifically required, this module produces no code for snprintf/vsnprintf.
 */
#if !defined(HAVE_SNPRINTF) || defined(PREFER_PORTABLE_SNPRINTF)

#if !defined(NEED_SNPRINTF_ONLY)
/* Variadic front end: packs its arguments into a va_list and forwards
 * everything to portable_vsnprintf, returning that routine's result. */
int portable_snprintf(char *str, size_t str_m, const char *fmt, /*args*/ ...) {
  va_list ap;
  int str_l;

  va_start(ap, fmt);
  str_l = portable_vsnprintf(str, str_m, fmt, ap);
  va_end(ap);
  return str_l;
}
#endif

/*
 * The formatter proper.  It is compiled as portable_snprintf (variadic) when
 * NEED_SNPRINTF_ONLY is defined and as portable_vsnprintf (va_list) otherwise.
 *
 *   str    destination buffer; it is only written to while str_l < str_m,
 *          and only terminated when str_m > 0
 *   str_m  size of the destination buffer, terminating null included
 *   fmt    format string; a NULL fmt is treated as ""
 *   ap     arguments consumed by the conversions in fmt
 *
 * Handled here: flags 0 - + space # ', field width and precision (literal
 * or '*'), length modifiers h, l and ll, conversions % c s d i u o x X p and
 * the synonyms D U O.  Anything else goes through the 'default' branch.
 *
 * Returns the length the complete output would have (excluding the
 * terminating null), cast to int, whether or not it fitted into str.
 */
#if defined(NEED_SNPRINTF_ONLY)
int portable_snprintf(char *str, size_t str_m, const char *fmt, /*args*/ ...) {
#else
int portable_vsnprintf(char *str, size_t str_m, const char *fmt, va_list ap) {
#endif

#if defined(NEED_SNPRINTF_ONLY)
  va_list ap;
#endif
  size_t str_l = 0;  /* would-be output length so far; may exceed str_m */
  const char *p = fmt;

/* In contrast with POSIX, the ISO C99 now says
 * that str can be NULL and str_m can be 0.
 * This is more useful than the old: if (str_m < 1) return -1; */

#if defined(NEED_SNPRINTF_ONLY)
  va_start(ap, fmt);
#endif
  if (!p) p = "";
  while (*p) {
    if (*p != '%') {
   /* if (str_l < str_m) str[str_l++] = *p++; -- this would be sufficient */
   /* but the following code achieves better performance for cases
    * where format string is long and contains few conversions */
      const char *q = strchr(p+1,'%');
      size_t n = !q ? strlen(p) : (q-p);
      if (str_l < str_m) {
        size_t avail = str_m-str_l;
        fast_memcpy(str+str_l, p, (n>avail?avail:n));
      }
      /* the full run is counted even when only part of it was stored */
      p += n; str_l += n;
    } else {
      const char *starting_p;
      size_t min_field_width = 0, precision = 0;
      int zero_padding = 0, precision_specified = 0, justify_left = 0;
      int alternate_form = 0, force_sign = 0;
      int space_for_positive = 1; /* If both the ' ' and '+' flags appear,
                                     the ' ' flag should be ignored. */
      char length_modifier = '\0'; /* allowed values: \0, h, l, L */
      char tmp[32];/* temporary buffer for simple numeric->string conversion */
        /* NOTE(review): tmp receives an optional sign or "0x" prefix plus the
           sprintf output of one integer/pointer conversion; 32 bytes is
           assumed to be enough on the target platform -- confirm */

      const char *str_arg; /* string address in case of string argument */
      size_t str_arg_l; /* natural field width of arg without padding
                           and sign */
      unsigned char uchar_arg;
        /* unsigned char argument value - only defined for c conversion.
           N.B. standard explicitly states the char argument for
           the c conversion is unsigned */

      size_t number_of_zeros_to_pad = 0;
        /* number of zeros to be inserted for numeric conversions
           as required by the precision or minimal field width */

      size_t zero_padding_insertion_ind = 0;
        /* index into tmp where zero padding is to be inserted */

      char fmt_spec = '\0';
        /* current conversion specifier character */

      str_arg = credits;/* just to make compiler happy (defined but not used)*/
      str_arg = NULL;
      starting_p = p; p++; /* skip '%' */
   /* parse flags */
      while (*p == '0' || *p == '-' || *p == '+' ||
             *p == ' ' || *p == '#' || *p == '\'') {
        switch (*p) {
        case '0': zero_padding = 1; break;
        case '-': justify_left = 1; break;
        case '+': force_sign = 1; space_for_positive = 0; break;
        case ' ': force_sign = 1;
     /* If both the ' ' and '+' flags appear, the ' ' flag should be ignored */
#ifdef PERL_COMPATIBLE
     /* ... but in Perl the last of ' ' and '+' applies */
                  space_for_positive = 1;
#endif
                  break;
        case '#': alternate_form = 1; break;
        case '\'': break;  /* accepted, but has no effect */
        }
        p++;
      }
   /* If the '0' and '-' flags both appear, the '0' flag should be ignored. */

   /* parse field width */
      if (*p == '*') {
        int j;
        p++; j = va_arg(ap, int);
        if (j >= 0) min_field_width = j;
        /* a negative '*' width means: left-justify within |width| */
        else { min_field_width = -j; justify_left = 1; }
      } else if (isdigit((int)(*p))) {
        /* size_t could be wider than unsigned int;
           make sure we treat argument like common implementations do */
        unsigned int uj = *p++ - '0';
        while (isdigit((int)(*p))) uj = 10*uj + (unsigned int)(*p++ - '0');
        min_field_width = uj;
      }
   /* parse precision */
      if (*p == '.') {
        p++; precision_specified = 1;
        if (*p == '*') {
          int j = va_arg(ap, int);
          p++;
          if (j >= 0) precision = j;
          else {
            precision_specified = 0; precision = 0;
         /* NOTE:
          *   Solaris 2.6 man page claims that in this case the precision
          *   should be set to 0. Digital Unix 4.0, HPUX 10 and BSD man page
          *   claim that this case should be treated as unspecified precision,
          *   which is what we do here.
          */
          }
        } else if (isdigit((int)(*p))) {
          /* size_t could be wider than unsigned int;
             make sure we treat argument like common implementations do */
          unsigned int uj = *p++ - '0';
          while (isdigit((int)(*p))) uj = 10*uj + (unsigned int)(*p++ - '0');
          precision = uj;
        }
      }
   /* parse 'h', 'l' and 'll' length modifiers */
      if (*p == 'h' || *p == 'l') {
        length_modifier = *p; p++;
        if (length_modifier == 'l' && *p == 'l') { /* double l = long long */
#ifdef SNPRINTF_LONGLONG_SUPPORT
          length_modifier = '2'; /* double l encoded as '2' */
#else
          length_modifier = 'l'; /* treat it as a single 'l' */
#endif
          p++;
        }
      }
      fmt_spec = *p;
   /* common synonyms: */
      switch (fmt_spec) {
      case 'i': fmt_spec = 'd'; break;
      case 'D': fmt_spec = 'd'; length_modifier = 'l'; break;
      case 'U': fmt_spec = 'u'; length_modifier = 'l'; break;
      case 'O': fmt_spec = 'o'; length_modifier = 'l'; break;
      default: break;
      }
   /* get parameter value, do initial processing */
      switch (fmt_spec) {
      case '%': /* % behaves similar to 's' regarding flags and field widths */
      case 'c': /* c behaves similar to 's' regarding flags and field widths */
      case 's':
        length_modifier = '\0'; /* wint_t and wchar_t not supported */
     /* the result of zero padding flag with non-numeric conversion specifier*/
     /* is undefined. Solaris and HPUX 10 does zero padding in this case, */
     /* Digital Unix and Linux does not. */
#if !defined(SOLARIS_COMPATIBLE) && !defined(HPUX_COMPATIBLE)
        zero_padding = 0; /* turn zero padding off for string conversions */
#endif
        str_arg_l = 1;  /* correct for '%' and 'c'; recomputed below for 's' */
        switch (fmt_spec) {
        case '%':
          str_arg = p; break;
        case 'c': {
          int j = va_arg(ap, int);
          uchar_arg = (unsigned char) j; /* standard demands unsigned char */
          str_arg = (const char *) &uchar_arg;
          break;
        }
        case 's':
          str_arg = va_arg(ap, const char *);
          /* a NULL string argument produces no characters */
          if (!str_arg) str_arg_l = 0;
       /* make sure not to address string beyond the specified precision !!! */
          else if (!precision_specified) str_arg_l = strlen(str_arg);
       /* truncate string if necessary as requested by precision */
          else if (precision == 0) str_arg_l = 0;
          else {
       /* memchr on HP does not like n > 2^31 !!! */
            const char *q = memchr(str_arg, '\0',
                             precision <= 0x7fffffff ? precision : 0x7fffffff);
            str_arg_l = !q ? precision : (q-str_arg);
          }
          break;
        default: break;
        }
        break;
      case 'd': case 'u': case 'o': case 'x': case 'X': case 'p': {
        /* NOTE: the u, o, x, X and p conversion specifiers imply
                 the value is unsigned; d implies a signed value */

        int arg_sign = 0;
          /* 0 if numeric argument is zero (or if pointer is NULL for 'p'),
            +1 if greater than zero (or nonzero for unsigned arguments),
            -1 if negative (unsigned argument is never negative) */

        int int_arg = 0; unsigned int uint_arg = 0;
          /* only defined for length modifier h, or for no length modifiers */

        long int long_arg = 0; unsigned long int ulong_arg = 0;
          /* only defined for length modifier l */

        void *ptr_arg = NULL;
          /* pointer argument value -only defined for p conversion */

#ifdef SNPRINTF_LONGLONG_SUPPORT
        long long int long_long_arg = 0;
        unsigned long long int ulong_long_arg = 0;
          /* only defined for length modifier ll */
#endif
        if (fmt_spec == 'p') {
        /* HPUX 10: An l, h, ll or L before any other conversion character
         *   (other than d, i, u, o, x, or X) is ignored.
         * Digital Unix:
         *   not specified, but seems to behave as HPUX does.
         * Solaris: If an h, l, or L appears before any other conversion
         *   specifier (other than d, i, u, o, x, or X), the behavior
         *   is undefined. (Actually %hp converts only 16-bits of address
         *   and %llp treats address as 64-bit data which is incompatible
         *   with (void *) argument on a 32-bit system).
         */
#ifdef SOLARIS_COMPATIBLE
#  ifdef SOLARIS_BUG_COMPATIBLE
          /* keep length modifiers even if it represents 'll' */
#  else
          if (length_modifier == '2') length_modifier = '\0';
#  endif
#else
          length_modifier = '\0';
#endif
          ptr_arg = va_arg(ap, void *);
          if (ptr_arg != NULL) arg_sign = 1;
        } else if (fmt_spec == 'd') { /* signed */
          switch (length_modifier) {
          case '\0':
          case 'h':
         /* It is non-portable to specify a second argument of char or short
          * to va_arg, because arguments seen by the called function
          * are not char or short. C converts char and short arguments
          * to int before passing them to a function.
          */
            int_arg = va_arg(ap, int);
            if (int_arg > 0) arg_sign = 1;
            else if (int_arg < 0) arg_sign = -1;
            break;
          case 'l':
            long_arg = va_arg(ap, long int);
            if (long_arg > 0) arg_sign = 1;
            else if (long_arg < 0) arg_sign = -1;
            break;
#ifdef SNPRINTF_LONGLONG_SUPPORT
          case '2':
            long_long_arg = va_arg(ap, long long int);
            if (long_long_arg > 0) arg_sign = 1;
            else if (long_long_arg < 0) arg_sign = -1;
            break;
#endif
          }
        } else { /* unsigned */
          switch (length_modifier) {
          case '\0':
          case 'h':
            uint_arg = va_arg(ap, unsigned int);
            if (uint_arg) arg_sign = 1;
            break;
          case 'l':
            ulong_arg = va_arg(ap, unsigned long int);
            if (ulong_arg) arg_sign = 1;
            break;
#ifdef SNPRINTF_LONGLONG_SUPPORT
          case '2':
            ulong_long_arg = va_arg(ap, unsigned long long int);
            if (ulong_long_arg) arg_sign = 1;
            break;
#endif
          }
        }
        /* from here on the converted text is assembled in tmp */
        str_arg = tmp; str_arg_l = 0;
     /* NOTE:
      *   For d, i, u, o, x, and X conversions, if precision is specified,
      *   the '0' flag should be ignored. This is so with Solaris 2.6,
      *   Digital UNIX 4.0, HPUX 10, Linux, FreeBSD, NetBSD; but not with Perl.
      */
#ifndef PERL_COMPATIBLE
        if (precision_specified) zero_padding = 0;
#endif
        if (fmt_spec == 'd') {
          if (force_sign && arg_sign >= 0)
            tmp[str_arg_l++] = space_for_positive ? ' ' : '+';
         /* leave negative numbers for sprintf to handle,
            to avoid handling tricky cases like (short int)(-32768) */
#ifdef LINUX_COMPATIBLE
        } else if (fmt_spec == 'p' && force_sign && arg_sign > 0) {
          tmp[str_arg_l++] = space_for_positive ? ' ' : '+';
#endif
        } else if (alternate_form) {
          if (arg_sign != 0 && (fmt_spec == 'x' || fmt_spec == 'X') )
            { tmp[str_arg_l++] = '0'; tmp[str_arg_l++] = fmt_spec; }
         /* alternate form should have no effect for p conversion, but ... */
#ifdef HPUX_COMPATIBLE
          else if (fmt_spec == 'p'
         /* HPUX 10: for an alternate form of p conversion,
          *          a nonzero result is prefixed by 0x. */
#ifndef HPUX_BUG_COMPATIBLE
         /* Actually it uses 0x prefix even for a zero value. */
                   && arg_sign != 0
#endif
                  ) { tmp[str_arg_l++] = '0'; tmp[str_arg_l++] = 'x'; }
#endif
        }
        zero_padding_insertion_ind = str_arg_l;
        if (!precision_specified) precision = 1; /* default precision is 1 */
        if (precision == 0 && arg_sign == 0
#if defined(HPUX_BUG_COMPATIBLE) || defined(LINUX_COMPATIBLE)
            && fmt_spec != 'p'
         /* HPUX 10 man page claims: With conversion character p the result of
          * converting a zero value with a precision of zero is a null string.
          * Actually HP returns all zeroes, and Linux returns "(nil)". */
#endif
        ) {
         /* converted to null string */
         /* When zero value is formatted with an explicit precision 0,
            the resulting formatted string is empty (d, i, u, o, x, X, p). */
        } else {
          /* f holds at most '%' + "ll" + specifier + '\0' = 5 characters */
          char f[5]; int f_l = 0;
          f[f_l++] = '%'; /* construct a simple format string for sprintf */
          if (!length_modifier) { }
          else if (length_modifier=='2') { f[f_l++] = 'l'; f[f_l++] = 'l'; }
          else f[f_l++] = length_modifier;
          f[f_l++] = fmt_spec; f[f_l++] = '\0';
          /* flags, width and precision are deliberately not passed on to
             sprintf; they are applied by the padding code further down */
          if (fmt_spec == 'p') str_arg_l += sprintf(tmp+str_arg_l, f, ptr_arg);
          else if (fmt_spec == 'd') { /* signed */
            switch (length_modifier) {
            case '\0':
            case 'h': str_arg_l+=sprintf(tmp+str_arg_l, f, int_arg); break;
            case 'l': str_arg_l+=sprintf(tmp+str_arg_l, f, long_arg); break;
#ifdef SNPRINTF_LONGLONG_SUPPORT
            case '2': str_arg_l+=sprintf(tmp+str_arg_l,f,long_long_arg); break;
#endif
            }
          } else { /* unsigned */
            switch (length_modifier) {
            case '\0':
            case 'h': str_arg_l+=sprintf(tmp+str_arg_l, f, uint_arg); break;
            case 'l': str_arg_l+=sprintf(tmp+str_arg_l, f, ulong_arg); break;
#ifdef SNPRINTF_LONGLONG_SUPPORT
            case '2': str_arg_l+=sprintf(tmp+str_arg_l,f,ulong_long_arg);break;
#endif
            }
          }
         /* include the optional minus sign and possible "0x"
            in the region before the zero padding insertion point */
          if (zero_padding_insertion_ind < str_arg_l &&
              tmp[zero_padding_insertion_ind] == '-') {
            zero_padding_insertion_ind++;
          }
          if (zero_padding_insertion_ind+1 < str_arg_l &&
              tmp[zero_padding_insertion_ind] == '0' &&
             (tmp[zero_padding_insertion_ind+1] == 'x' ||
              tmp[zero_padding_insertion_ind+1] == 'X') ) {
            zero_padding_insertion_ind += 2;
          }
        }
        { size_t num_of_digits = str_arg_l - zero_padding_insertion_ind;
          if (alternate_form && fmt_spec == 'o'
#ifdef HPUX_COMPATIBLE /* ("%#.o",0) -> "" */
              && (str_arg_l > 0)
#endif
#ifdef DIGITAL_UNIX_BUG_COMPATIBLE /* ("%#o",0) -> "00" */
#else
              /* unless zero is already the first character */
              && !(zero_padding_insertion_ind < str_arg_l
                   && tmp[zero_padding_insertion_ind] == '0')
#endif
          ) { /* assure leading zero for alternate-form octal numbers */
            if (!precision_specified || precision < num_of_digits+1) {
             /* precision is increased to force the first character to be zero,
                except if a zero value is formatted with an explicit precision
                of zero */
              precision = num_of_digits+1; precision_specified = 1;
            }
          }
       /* zero padding to specified precision? */
          if (num_of_digits < precision)
            number_of_zeros_to_pad = precision - num_of_digits;
        }
     /* zero padding to specified minimal field width? */
        if (!justify_left && zero_padding) {
          int n = min_field_width - (str_arg_l+number_of_zeros_to_pad);
          if (n > 0) number_of_zeros_to_pad += n;
        }
        break;
      }
      default: /* unrecognized conversion specifier, keep format string as-is*/
        zero_padding = 0; /* turn zero padding off for non-numeric convers. */
#ifndef DIGITAL_UNIX_COMPATIBLE
        justify_left = 1; min_field_width = 0; /* reset flags */
#endif
#if defined(PERL_COMPATIBLE) || defined(LINUX_COMPATIBLE)
     /* keep the entire format string unchanged */
        str_arg = starting_p; str_arg_l = p - starting_p;
     /* well, not exactly so for Linux, which does something inbetween,
      * and I don't feel an urge to imitate it: "%+++++hy" -> "%+y" */
#else
     /* discard the unrecognized conversion, just keep *
      * the unrecognized conversion character          */
        str_arg = p; str_arg_l = 0;
#endif
        if (*p) str_arg_l++; /* include invalid conversion specifier unchanged
                                if not at end-of-string */
        break;
      }
      if (*p) p++; /* step over the just processed conversion specifier */
   /* insert padding to the left as requested by min_field_width;
      this does not include the zero padding in case of numerical conversions*/
      if (!justify_left) { /* left padding with blank or zero */
        int n = min_field_width - (str_arg_l+number_of_zeros_to_pad);
        if (n > 0) {
          if (str_l < str_m) {
            size_t avail = str_m-str_l;
            fast_memset(str+str_l, (zero_padding?'0':' '), (n>avail?avail:n));
          }
          str_l += n;
        }
      }
   /* zero padding as requested by the precision or by the minimal field width
    * for numeric conversions required? */
      if (number_of_zeros_to_pad <= 0) {
     /* will not copy first part of numeric right now, *
      * force it to be copied later in its entirety    */
        zero_padding_insertion_ind = 0;
      } else {
     /* insert first part of numerics (sign or '0x') before zero padding */
        int n = zero_padding_insertion_ind;
        if (n > 0) {
          if (str_l < str_m) {
            size_t avail = str_m-str_l;
            fast_memcpy(str+str_l, str_arg, (n>avail?avail:n));
          }
          str_l += n;
        }
     /* insert zero padding as requested by the precision or min field width */
        n = number_of_zeros_to_pad;
        if (n > 0) {
          if (str_l < str_m) {
            size_t avail = str_m-str_l;
            fast_memset(str+str_l, '0', (n>avail?avail:n));
          }
          str_l += n;
        }
      }
   /* insert formatted string
    * (or as-is conversion specifier for unknown conversions) */
      { int n = str_arg_l - zero_padding_insertion_ind;
        if (n > 0) {
          if (str_l < str_m) {
            size_t avail = str_m-str_l;
            fast_memcpy(str+str_l, str_arg+zero_padding_insertion_ind,
                        (n>avail?avail:n));
          }
          str_l += n;
        }
      }
   /* insert right padding */
      if (justify_left) { /* right blank padding to the field width */
        int n = min_field_width - (str_arg_l+number_of_zeros_to_pad);
        if (n > 0) {
          if (str_l < str_m) {
            size_t avail = str_m-str_l;
            fast_memset(str+str_l, ' ', (n>avail?avail:n));
          }
          str_l += n;
        }
      }
    }
  }
#if defined(NEED_SNPRINTF_ONLY)
  va_end(ap);
#endif
  if (str_m > 0) { /* make sure the string is null-terminated
                      even at the expense of overwriting the last character
                      (shouldn't happen, but just in case) */
    str[str_l <= str_m-1 ? str_l : str_m-1] = '\0';
  }
  /* Return the number of characters formatted (excluding trailing null
   * character), that is, the number of characters that would have been
   * written to the buffer if it were large enough.
   *
   * The value of str_l should be returned, but str_l is of unsigned type
   * size_t, and snprintf is int, possibly leading to an undetected
   * integer overflow, resulting in a negative return value, which is illegal.
   * Both XSH5 and ISO C99 (at least the draft) are silent on this issue.
   * Should errno be set to EOVERFLOW and EOF returned in this case???
   */
  return (int) str_l;
}
#endif
"libgen.h" + +// http://www.opengroup.org/onlinepubs/007908775/xsh/basename.html + +char* basename(char *path) { + if (path != NULL) { + // Find the last position of the \ in the path name + char* pos = strrchr(path, '\\'); + + if (pos != NULL) { // If a \ char was found... + if (pos + 1 != NULL) // If it is not the last character in the string... + return pos + 1; // then return a pointer to the first character after \. + else + return pos; // else return a pointer to \ + + } else { // If a \ char was NOT found + return path; // return the pointer passed to basename (this is probably non-conformant) + } + + } else { // If path == NULL, return "." + return "."; + } +} Index: branches/apertium-tagger/apertium2/apertium/win32/libgen.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/libgen.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/libgen.h (revision 69632) @@ -0,0 +1,14 @@ +#ifndef LIBGEN_H +#define LIBGEN_H + +#ifdef __cplusplus + extern "C" { +#endif + +char *basename(char *); + +#ifdef __cplusplus + } +#endif + +#endif Index: branches/apertium-tagger/apertium2/apertium/win32/regex.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/regex.c (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/regex.c (revision 69632) @@ -0,0 +1,4948 @@ +/* Extended regular expression matching and search library, + version 0.12. + (Implements POSIX draft P10003.2/D11.2, except for + internationalization features.) + + Copyright (C) 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* AIX requires this to be the first thing in the file. */ +#if defined (_AIX) && !defined (REGEX_MALLOC) + #pragma alloca +#endif + +#define _GNU_SOURCE + +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +#include + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs + +#include "lisp.h" +#include "buffer.h" +#include "syntax.h" + +/* Emacs uses `NULL' as a predicate. */ +#undef NULL + +#else /* not emacs */ + +/* We used to test for `BSTRING' here, but only GCC and Emacs define + `BSTRING', as far as I know, and neither of them use this code. */ +#if HAVE_STRING_H || STDC_HEADERS +#include +#ifndef bcmp +#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) +#endif +#ifndef bcopy +#define bcopy(s, d, n) memcpy ((d), (s), (n)) +#endif +#ifndef bzero +#define bzero(s, n) memset ((s), 0, (n)) +#endif +#else +#include +#endif + +#ifdef STDC_HEADERS +#include +#else +char *malloc (); +char *realloc (); +#endif + + +/* Define the syntax stuff for \<, \>, etc. */ + +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +#ifndef Sword +#define Sword 1 +#endif + +#ifdef SYNTAX_TABLE + +extern char *re_syntax_table; + +#else /* not SYNTAX_TABLE */ + +/* How many characters in the character set. 
*/
+#define CHAR_SET_SIZE 256
+
+static char re_syntax_table[CHAR_SET_SIZE];
+/* Fill re_syntax_table on the first call only (later calls return at
+   once because of the static `done' flag): the table is zeroed, then
+   the entries for 'a'-'z', 'A'-'Z', '0'-'9' and '_' are set to Sword. */
+static void
+init_syntax_once ()
+{
+   register int c;
+   static int done = 0;
+
+   if (done)
+     return;
+
+   bzero (re_syntax_table, sizeof re_syntax_table);
+
+   for (c = 'a'; c <= 'z'; c++)
+     re_syntax_table[c] = Sword;
+
+   for (c = 'A'; c <= 'Z'; c++)
+     re_syntax_table[c] = Sword;
+
+   for (c = '0'; c <= '9'; c++)
+     re_syntax_table[c] = Sword;
+
+   re_syntax_table['_'] = Sword;
+
+   done = 1;
+}
+
+#endif /* not SYNTAX_TABLE */
+
+#define SYNTAX(c) re_syntax_table[c]
+
+#endif /* not emacs */
+
+/* Get the interface, including the syntax bits. */
+#include "regex.h"
+
+/* isalpha etc. are used for the character classes. */
+#include
+
+#ifndef isascii /* No isascii macro: accept every value, so the IS* wrappers below reduce to the bare ctype test. */
+#define isascii(c) 1
+#endif
+
+#ifdef isblank
+#define ISBLANK(c) (isascii (c) && isblank (c))
+#else
+#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
+#endif
+#ifdef isgraph
+#define ISGRAPH(c) (isascii (c) && isgraph (c))
+#else
+#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
+#endif
+
+#define ISPRINT(c) (isascii (c) && isprint (c))
+#define ISDIGIT(c) (isascii (c) && isdigit (c))
+#define ISALNUM(c) (isascii (c) && isalnum (c))
+#define ISALPHA(c) (isascii (c) && isalpha (c))
+#define ISCNTRL(c) (isascii (c) && iscntrl (c))
+#define ISLOWER(c) (isascii (c) && islower (c))
+#define ISPUNCT(c) (isascii (c) && ispunct (c))
+#define ISSPACE(c) (isascii (c) && isspace (c))
+#define ISUPPER(c) (isascii (c) && isupper (c))
+#define ISXDIGIT(c) (isascii (c) && isxdigit (c))
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+/* We remove any previous definition of `SIGN_EXTEND_CHAR',
+   since ours (we hope) works properly with all combinations of
+   machines, compilers, `char' and `unsigned char' argument types.
+   (Per Bothner suggested the basic approach.) */
+#undef SIGN_EXTEND_CHAR
+#if __STDC__
+#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
+#else /* not __STDC__ */
+/* As in Harbison and Steele.
*/ +#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + +#ifdef REGEX_MALLOC + +#define REGEX_ALLOCATE malloc +#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +#ifndef alloca + +/* Make alloca work the best possible way. */ +#ifdef __GNUC__ +#define alloca __builtin_alloca +#else /* not __GNUC__ */ +#if HAVE_ALLOCA_H +#include +#else /* not __GNUC__ or HAVE_ALLOCA_H */ +#ifndef _AIX /* Already did AIX, up at the top. */ +char *alloca (); +#endif /* not _AIX */ +#endif /* not HAVE_ALLOCA_H */ +#endif /* not __GNUC__ */ + +#endif /* not alloca */ + +#define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +#define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + bcopy (source, destination, osize), \ + destination) + +#endif /* not REGEX_MALLOC */ + + +/* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. 
*/ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +typedef char boolean; +#define false 0 +#define true 1 + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. + + The value of `exactn' is needed in search.c (search_buffer) in Emacs. + So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of + `exactn' we use here must also be 1. */ + +typedef enum +{ + no_op = 0, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn = 1, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) 
*/ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. 
A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ + +#ifdef emacs + ,before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. 
*/
+
+#define STORE_NUMBER_AND_INCR(destination, number) \
+  do { \
+    STORE_NUMBER (destination, number); \
+    (destination) += 2; \
+  } while (0)
+
+/* Put into DESTINATION a number stored in two contiguous bytes starting
+   at SOURCE. */
+
+#define EXTRACT_NUMBER(destination, source) \
+  do { \
+    (destination) = *(source) & 0377; \
+    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
+  } while (0)
+
+#ifdef DEBUG /* Function form of EXTRACT_NUMBER: stores into *DEST the
+   value whose low byte is SOURCE[0] and whose sign-extended high byte
+   is SOURCE[1]. */
+static void
+extract_number (dest, source)
+    int *dest;
+    unsigned char *source;
+{
+  int temp = SIGN_EXTEND_CHAR (*(source + 1));
+  *dest = *source & 0377;
+  *dest += temp << 8; /* NOTE(review): temp can be negative here, and ISO C leaves a left shift of a negative int undefined -- confirm the target compilers treat it arithmetically. */
+}
+
+#ifndef EXTRACT_MACROS /* To debug the macros. */
+#undef EXTRACT_NUMBER
+#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
+#endif /* not EXTRACT_MACROS */
+
+#endif /* DEBUG */
+
+/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
+   SOURCE must be an lvalue. */
+
+#define EXTRACT_NUMBER_AND_INCR(destination, source) \
+  do { \
+    EXTRACT_NUMBER (destination, source); \
+    (source) += 2; \
+  } while (0)
+
+#ifdef DEBUG /* Function form of EXTRACT_NUMBER_AND_INCR: reads the
+   number at *SOURCE through extract_number, then advances *SOURCE by
+   two bytes. */
+static void
+extract_number_and_incr (destination, source)
+    int *destination;
+    unsigned char **source;
+{
+  extract_number (destination, *source);
+  *source += 2;
+}
+
+#ifndef EXTRACT_MACROS
+#undef EXTRACT_NUMBER_AND_INCR
+#define EXTRACT_NUMBER_AND_INCR(dest, src) \
+  extract_number_and_incr (&dest, &src)
+#endif /* not EXTRACT_MACROS */
+
+#endif /* DEBUG */
+
+/* If DEBUG is defined, Regex prints many voluminous messages about what
+   it is doing (if the variable `debug' is nonzero). If linked with the
+   main program in `iregex.c', you can enter patterns and strings
+   interactively. And if linked with the main program in `main.c' and
+   the other test files, you can run the already-written tests. */
+
+#ifdef DEBUG
+
+/* We use standard I/O for debugging. */
+#include
+
+/* It is useful to test things that ``must'' be true when debugging.
*/ +#include + +static int debug = 0; + +#define DEBUG_STATEMENT(e) e +#define DEBUG_PRINT1(x) if (debug) printf (x) +#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +extern void printchar (); + +/* Print the fastmap in human-readable form. */ + +void +print_fastmap (fastmap) + char *fastmap; +{ + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) + { + if (fastmap[i++]) + { + was_a_range = 0; + printchar (i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) + { + was_a_range = 1; + i++; + } + if (was_a_range) + { + printf ("-"); + printchar (i - 1); + } + } + } + putchar ('\n'); +} + + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void +print_partial_compiled_pattern (start, end) + unsigned char *start; + unsigned char *end; +{ + int mcnt, mcnt2; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) + { + printf ("(null)\n"); + return; + } + + /* Loop over pattern commands. 
*/ + while (p < pend) + { + switch ((re_opcode_t) *p++) + { + case no_op: + printf ("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf ("/exactn/%d", mcnt); + do + { + putchar ('/'); + printchar (*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf ("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf ("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf ("/duplicate/%d", *p++); + break; + + case anychar: + printf ("/anychar"); + break; + + case charset: + case charset_not: + { + register int c; + + printf ("/charset%s", + (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); + + assert (p + *p < pend); + + for (c = 0; c < *p; c++) + { + unsigned bit; + unsigned char map_byte = p[1 + c]; + + putchar ('/'); + + for (bit = 0; bit < BYTEWIDTH; bit++) + if (map_byte & (1 << bit)) + printchar (c * BYTEWIDTH + bit); + } + p += 1 + *p; + break; + } + + case begline: + printf ("/begline"); + break; + + case endline: + printf ("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump/0/%d", mcnt); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_keep_string_jump/0/%d", mcnt); + break; + + case dummy_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/dummy_failure_jump/0/%d", mcnt); + break; + + case push_dummy_failure: + printf ("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/maybe_pop_jump/0/%d", mcnt); + break; + + case pop_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/pop_failure_jump/0/%d", mcnt); + break; + + case jump_past_alt: + extract_number_and_incr (&mcnt, &p); + printf ("/jump_past_alt/0/%d", mcnt); + break; + + case jump: + extract_number_and_incr (&mcnt, &p); + printf ("/jump/0/%d", mcnt); + break; + + case succeed_n: + extract_number_and_incr (&mcnt, 
&p); + extract_number_and_incr (&mcnt2, &p); + printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case jump_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case set_number_at: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); + break; + + case wordbound: + printf ("/wordbound"); + break; + + case notwordbound: + printf ("/notwordbound"); + break; + + case wordbeg: + printf ("/wordbeg"); + break; + + case wordend: + printf ("/wordend"); + +#ifdef emacs + case before_dot: + printf ("/before_dot"); + break; + + case at_dot: + printf ("/at_dot"); + break; + + case after_dot: + printf ("/after_dot"); + break; + + case syntaxspec: + printf ("/syntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + + case notsyntaxspec: + printf ("/notsyntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; +#endif /* emacs */ + + case wordchar: + printf ("/wordchar"); + break; + + case notwordchar: + printf ("/notwordchar"); + break; + + case begbuf: + printf ("/begbuf"); + break; + + case endbuf: + printf ("/endbuf"); + break; + + default: + printf ("?%d", *(p-1)); + } + } + printf ("/\n"); +} + + +void +print_compiled_pattern (bufp) + struct re_pattern_buffer *bufp; +{ + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern (buffer, buffer + bufp->used); + printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) + { + printf ("fastmap: "); + print_fastmap (bufp->fastmap); + } + + printf ("re_nsub: %d\t", bufp->re_nsub); + printf ("regs_alloc: %d\t", bufp->regs_allocated); + printf ("can_be_null: %d\t", bufp->can_be_null); + printf ("newline_anchor: %d\n", bufp->newline_anchor); + printf ("no_sub: %d\t", bufp->no_sub); + printf ("not_bol: %d\t", bufp->not_bol); + printf ("not_eol: %d\t", bufp->not_eol); + 
printf ("syntax: %d\n", bufp->syntax); + /* Perhaps we should print the translate table? */ +} + + +void +print_double_string (where, string1, size1, string2, size2) + const char *where; + const char *string1; + const char *string2; + int size1; + int size2; +{ + unsigned this_char; + + if (where == NULL) + printf ("(null)"); + else + { + if (FIRST_STRING_P (where)) + { + for (this_char = where - string1; this_char < size1; this_char++) + printchar (string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + printchar (string2[this_char]); + } +} + +#else /* not DEBUG */ + +#undef assert +#define assert(e) + +#define DEBUG_STATEMENT(e) +#define DEBUG_PRINT1(x) +#define DEBUG_PRINT2(x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t +re_set_syntax (syntax) + reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; + return ret; +} + +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. 
*/ + +static const char *re_error_msg[] = + { NULL, /* REG_NOERROR */ + "No match", /* REG_NOMATCH */ + "Invalid regular expression", /* REG_BADPAT */ + "Invalid collation character", /* REG_ECOLLATE */ + "Invalid character class name", /* REG_ECTYPE */ + "Trailing backslash", /* REG_EESCAPE */ + "Invalid back reference", /* REG_ESUBREG */ + "Unmatched [ or [^", /* REG_EBRACK */ + "Unmatched ( or \\(", /* REG_EPAREN */ + "Unmatched \\{", /* REG_EBRACE */ + "Invalid content of \\{\\}", /* REG_BADBR */ + "Invalid range end", /* REG_ERANGE */ + "Memory exhausted", /* REG_ESPACE */ + "Invalid preceding regular expression", /* REG_BADRPT */ + "Premature end of regular expression", /* REG_EEND */ + "Regular expression too big", /* REG_ESIZE */ + "Unmatched ) or \\)", /* REG_ERPAREN */ + }; + +/* Subroutine declarations and macros for regex_compile. */ + +static void store_op1 (), store_op2 (); +static void insert_op1 (), insert_op2 (); +static boolean at_begline_loc_p (), at_endline_loc_p (); +static boolean group_in_compile_stack (); +static reg_errcode_t compile_range (); + +/* Fetch the next character in the uncompiled pattern---translating it + if necessary. Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ +#define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = translate[c]; \ + } while (0) + +/* Fetch the next character in the uncompiled pattern, with no + translation. */ +#define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + +/* Go backwards one character in the pattern. */ +#define PATUNFETCH p-- + + +/* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. 
But + when we use a character as a subscript we must make it unsigned. */ +#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) + + +/* Macros for outputting the compiled pattern into `buffer'. */ + +/* If the buffer isn't allocated when it comes in, use this. */ +#define INIT_BUF_SIZE 32 + +/* Make sure we have at least N more bytes of space in buffer. */ +#define GET_BUFFER_SPACE(n) \ + while (b - bufp->buffer + (n) > bufp->allocated) \ + EXTEND_BUFFER () + +/* Make sure we have one more byte of buffer space and then add C to it. */ +#define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + +/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ +#define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + +/* As with BUF_PUSH_2, except for three bytes. */ +#define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + +/* Store a jump with opcode OP at LOC to location TO. We store a + relative address offset by the three bytes the jump itself occupies. */ +#define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (to) - (loc) - 3) + +/* Likewise, for a two-argument jump. */ +#define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (to) - (loc) - 3, arg) + +/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (to) - (loc) - 3, b) + +/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (to) - (loc) - 3, arg, b) + + +/* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. 
So if 2^16 bytes turns out to + be too small, many things would have to change. */ +#define MAX_BUF_SIZE (1L << 16) + + +/* Extend the buffer by twice its current size via realloc and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If extending the buffer results in it + being larger than MAX_BUF_SIZE, then flag memory exhausted. */ +#define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + +/* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. */ +#define MAX_REGNUM 255 + +/* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ +typedef unsigned regnum_t; + + +/* Macros for the compile stack. */ + +/* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. 
*/ +typedef int pattern_offset_t; + +typedef struct +{ + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; +} compile_stack_elt_t; + + +typedef struct +{ + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} compile_stack_type; + + +#define INIT_COMPILE_STACK_SIZE 32 + +#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) +#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + +/* The next available element. */ +#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + +/* Set the bit for character C in a list. */ +#define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) c) % BYTEWIDTH)) + + +/* Get the next unsigned number in the uncompiled pattern. */ +#define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while (ISDIGIT (c)) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ + } + +#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ + +#define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. 
+ + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +static reg_errcode_t +regex_compile (pattern, size, syntax, bufp) + const char *pattern; + int size; + reg_syntax_t syntax; + struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random tempory spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + char *translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. 
*/ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1 ("\nCompiling pattern: "); + if (debug) + { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + printchar (pattern[debug_count]); + putchar ('\n'); + } +#endif /* DEBUG */ + + /* Initialize the compile stack. */ + compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined (emacs) && !defined (SYNTAX_TABLE) + /* Initialize the syntax table. */ + init_syntax_once (); +#endif + + if (bufp->allocated == 0) + { + if (bufp->buffer) + { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); + } + else + { /* Caller did not allocate a buffer. Do it for them. 
*/ + bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) return REG_ESPACE; + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) + { + PATFETCH (c); + + switch (c) + { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p (pattern, p, syntax)) + BUF_PUSH (begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p (p, pend, syntax)) + BUF_PUSH (endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. 
*/ + + for (;;) + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH (c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) + ; + + else if (syntax & RE_BK_PLUS_QM && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + if (!(c1 == '+' || c1 == '?')) + { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } + else + { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) + { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. This way we only + push a failure point once, instead of every time + through the loop. */ + assert (p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE (3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + && zero_times_ok + && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && !(syntax & RE_DOT_NEWLINE)) + { /* We have .*\n. */ + STORE_JUMP (jump, b, laststart); + keep_string_p = true; + } + else + /* Anything else. */ + STORE_JUMP (maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. 
*/ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, + laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) + { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH (anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) return REG_EBRACK; + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE (34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + SET_LIST_BIT ('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) return REG_EBRACK; + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + SET_LIST_BIT (c1); + continue; + } + + /* Could be the end of the bracket expression. 
If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + return REG_ERANGE; + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret + = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + + ret = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + /* See if we're at the beginning of a possible character + class. */ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) return REG_EBRACK; + + for (;;) + { + PATFETCH (c); + if (c == ':' || c == ']' || p == pend + || c1 == CHAR_CLASS_MAX_LENGTH) + break; + str[c1++] = c; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and:`]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). 
*/ + if (c == ':' && *p == ']') + { + int ch; + boolean is_alnum = STREQ (str, "alnum"); + boolean is_alpha = STREQ (str, "alpha"); + boolean is_blank = STREQ (str, "blank"); + boolean is_cntrl = STREQ (str, "cntrl"); + boolean is_digit = STREQ (str, "digit"); + boolean is_graph = STREQ (str, "graph"); + boolean is_lower = STREQ (str, "lower"); + boolean is_print = STREQ (str, "print"); + boolean is_punct = STREQ (str, "punct"); + boolean is_space = STREQ (str, "space"); + boolean is_upper = STREQ (str, "upper"); + boolean is_xdigit = STREQ (str, "xdigit"); + + if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) return REG_EBRACK; + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + { + if ( (is_alnum && ISALNUM (ch)) + || (is_alpha && ISALPHA (ch)) + || (is_blank && ISBLANK (ch)) + || (is_cntrl && ISCNTRL (ch)) + || (is_digit && ISDIGIT (ch)) + || (is_graph && ISGRAPH (ch)) + || (is_lower && ISLOWER (ch)) + || (is_print && ISPRINT (ch)) + || (is_punct && ISPUNCT (ch)) + || (is_space && ISSPACE (ch)) + || (is_upper && ISUPPER (ch)) + || (is_xdigit && ISXDIGIT (ch))) + SET_LIST_BIT (ch); + } + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT (':'); + had_char_class = false; + } + } + else + { + had_char_class = false; + SET_LIST_BIT (c); + } + } + + /* Discard any (non)matching list bytes that are all 0 at the + end of the map. Decrease the map-length byte too. 
*/ + while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) + b[-1]--; + b += b[-1]; + } + break; + + + case '(': + if (syntax & RE_NO_BK_PARENS) + goto handle_open; + else + goto normal_char; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto handle_close; + else + goto normal_char; + + + case '\n': + if (syntax & RE_NEWLINE_ALT) + goto handle_alt; + else + goto normal_char; + + + case '|': + if (syntax & RE_NO_BK_VBAR) + goto handle_alt; + else + goto normal_char; + + + case '{': + if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) + goto handle_interval; + else + goto normal_char; + + + case '\\': + if (p == pend) return REG_EESCAPE; + + /* Do not translate the character after the \, so that we can + distinguish, e.g., \B from \b, even if we normally would + translate, e.g., B to b. */ + PATFETCH_RAW (c); + + switch (c) + { + case '(': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + handle_open: + bufp->re_nsub++; + regnum++; + + if (COMPILE_STACK_FULL) + { + RETALLOC (compile_stack.stack, compile_stack.size << 1, + compile_stack_elt_t); + if (compile_stack.stack == NULL) return REG_ESPACE; + + compile_stack.size <<= 1; + } + + /* These are the values to restore when we hit end of this + group. They are all relative offsets, so that if the + whole pattern moves because of realloc, they will still + be valid. */ + COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.fixup_alt_jump + = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + COMPILE_STACK_TOP.regnum = regnum; + + /* We will eventually replace the 0 with the number of + groups inner to this one. But do not push a + start_memory for groups beyond the last one we can + represent in the compiled pattern. 
*/ + if (regnum <= MAX_REGNUM) + { + COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; + BUF_PUSH_3 (start_memory, regnum, 0); + } + + compile_stack.avail++; + + fixup_alt_jump = 0; + laststart = 0; + begalt = b; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + break; + + + case ')': + if (syntax & RE_NO_BK_PARENS) goto normal_backslash; + + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_backslash; + else + return REG_ERPAREN; + + handle_close: + if (fixup_alt_jump) + { /* Push a dummy failure point at the end of the + alternative for a possible future + `pop_failure_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUF_PUSH (push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. */ + STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); + } + + /* See similar code for backslashed left paren above. */ + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + else + return REG_ERPAREN; + + /* Since we just checked for an empty stack above, this + ``can't happen''. */ + assert (compile_stack.avail != 0); + { + /* We don't just want to restore into `regnum', because + later groups should continue to be numbered higher, + as in `(ab)c(de)' -- the second group is #2. */ + regnum_t this_group_regnum; + + compile_stack.avail--; + begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + fixup_alt_jump + = COMPILE_STACK_TOP.fixup_alt_jump + ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 + : 0; + laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + this_group_regnum = COMPILE_STACK_TOP.regnum; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. 
*/ + pending_exact = 0; + + /* We're at the end of the group, so now we know how many + groups were inside this one. */ + if (this_group_regnum <= MAX_REGNUM) + { + unsigned char *inner_group_loc + = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; + + *inner_group_loc = regnum - this_group_regnum; + BUF_PUSH_3 (stop_memory, this_group_regnum, + regnum - this_group_regnum); + } + } + break; + + + case '|': /* `\|'. */ + if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) + goto normal_backslash; + handle_alt: + if (syntax & RE_LIMITED_OPS) + goto normal_char; + + /* Insert before the previous alternative a jump which + jumps to this alternative if the former fails. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (on_failure_jump, begalt, b + 6); + pending_exact = 0; + b += 3; + + /* The alternative before this one has a jump after it + which gets executed if it gets matched. Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b', then fixup_alt_jump right now points to a + three-byte space after `a'. We'll put in the jump, set + fixup_alt_jump to right after `b', and leave behind three + bytes which we'll fill in when we get to after `c'. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + /* Mark and leave space for a jump after this alternative, + to be filled in later either by next alternative or + when know we're at the end of a series of alternatives. */ + fixup_alt_jump = b; + GET_BUFFER_SPACE (3); + b += 3; + + laststart = 0; + begalt = b; + break; + + + case '{': + /* If \{ is a literal. */ + if (!(syntax & RE_INTERVALS) + /* If we're at `\{' and it's not the open-interval + operator. 
*/ + || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) + || (p - 2 == pattern && p == pend)) + goto normal_backslash; + + handle_interval: + { + /* If got here, then the syntax allows intervals. */ + + /* At least (most) this many matches must be made. */ + int lower_bound = -1, upper_bound = -1; + + beg_interval = p - 1; + + if (p == pend) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_EBRACE; + } + + GET_UNSIGNED_NUMBER (lower_bound); + + if (c == ',') + { + GET_UNSIGNED_NUMBER (upper_bound); + if (upper_bound < 0) upper_bound = RE_DUP_MAX; + } + else + /* Interval such as `{1}' => match exactly once. */ + upper_bound = lower_bound; + + if (lower_bound < 0 || upper_bound > RE_DUP_MAX + || lower_bound > upper_bound) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (c != '\\') return REG_EBRACE; + + PATFETCH (c); + } + + if (c != '}') + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + /* We just parsed a valid interval. */ + + /* If it's invalid to have no preceding re. */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (syntax & RE_CONTEXT_INDEP_OPS) + laststart = b; + else + goto unfetch_interval; + } + + /* If the upper bound is zero, don't want to succeed at + all; jump from `laststart' to `b + 3', which will be + the end of the buffer after we insert the jump. */ + if (upper_bound == 0) + { + GET_BUFFER_SPACE (3); + INSERT_JUMP (jump, laststart, b + 3); + b += 3; + } + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at + set_number_at + succeed_n + + jump_n + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else + { /* If the upper bound is > 1, we need to insert + more at the end of the loop. 
*/ + unsigned nbytes = 10 + (upper_bound > 1) * 10; + + GET_BUFFER_SPACE (nbytes); + + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2 (succeed_n, laststart, + b + 5 + (upper_bound > 1) * 5, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 5, lower_bound, b); + b += 5; + + if (upper_bound > 1) + { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2 (jump_n, b, laststart + 5, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2 (set_number_at, laststart, b - laststart, + upper_bound - 1, b); + b += 5; + } + } + pending_exact = 0; + beg_interval = NULL; + } + break; + + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + assert (beg_interval); + p = beg_interval; + beg_interval = NULL; + + /* normal_char and normal_backslash need `c'. 
*/ + PATFETCH (c); + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (p > pattern && p[-1] == '\\') + goto normal_backslash; + } + goto normal_char; + +#ifdef emacs + /* There is no way to specify the before_dot and after_dot + operators. rms says this is ok. --karl */ + case '=': + BUF_PUSH (at_dot); + break; + + case 's': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); + break; + + case 'S': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); + break; +#endif /* emacs */ + + + case 'w': + laststart = b; + BUF_PUSH (wordchar); + break; + + + case 'W': + laststart = b; + BUF_PUSH (notwordchar); + break; + + + case '<': + BUF_PUSH (wordbeg); + break; + + case '>': + BUF_PUSH (wordend); + break; + + case 'b': + BUF_PUSH (wordbound); + break; + + case 'B': + BUF_PUSH (notwordbound); + break; + + case '`': + BUF_PUSH (begbuf); + break; + + case '\'': + BUF_PUSH (endbuf); + break; + + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + if (syntax & RE_NO_BK_REFS) + goto normal_char; + + c1 = c - '0'; + + if (c1 > regnum) + return REG_ESUBREG; + + /* Can't back reference to a subexpression if inside of it. */ + if (group_in_compile_stack (compile_stack, c1)) + goto normal_char; + + laststart = b; + BUF_PUSH_2 (duplicate, c1); + break; + + + case '+': + case '?': + if (syntax & RE_BK_PLUS_QM) + goto handle_plus; + else + goto normal_backslash; + + default: + normal_backslash: + /* You might think it would be useful for \ to mean + not to translate; but if we don't translate it + it will never match anything. */ + c = TRANSLATE (c); + goto normal_char; + } + break; + + + default: + /* Expects the character in `c'. */ + normal_char: + /* If no exactn currently being built. */ + if (!pending_exact + + /* If last exactn not at current position. */ + || pending_exact + *pending_exact + 1 != b + + /* We have only one byte following the exactn for the count. 
*/ + || *pending_exact == (1 << BYTEWIDTH) - 1 + + /* If followed by a repetition operator. */ + || *p == '*' || *p == '^' + || ((syntax & RE_BK_PLUS_QM) + ? *p == '\\' && (p[1] == '+' || p[1] == '?') + : (*p == '+' || *p == '?')) + || ((syntax & RE_INTERVALS) + && ((syntax & RE_NO_BK_BRACES) + ? *p == '{' + : (p[0] == '\\' && p[1] == '{')))) + { + /* Start building a new exactn. */ + + laststart = b; + + BUF_PUSH_2 (exactn, 0); + pending_exact = b - 1; + } + + BUF_PUSH (c); + (*pending_exact)++; + break; + } /* switch (c) */ + } /* while p != pend */ + + + /* Through the pattern now. + NOTE(review): the REG_EPAREN return below comes before the free of + compile_stack.stack; confirm that path does not leak the stack. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + if (!COMPILE_STACK_EMPTY) + return REG_EPAREN; + + free (compile_stack.stack); + + /* We have succeeded; set the length of the buffer. */ + bufp->used = b - bufp->buffer; + +#ifdef DEBUG + if (debug) + { + DEBUG_PRINT1 ("\nCompiled pattern: "); + print_compiled_pattern (bufp); + } +#endif /* DEBUG */ + + return REG_NOERROR; +} /* regex_compile */ + +/* Subroutines for `regex_compile'. */ + +/* Store OP at LOC followed by two-byte integer parameter ARG. + OP is written as one byte at LOC and ARG is written with STORE_NUMBER + starting at LOC + 1. Nothing is moved and no room check is made + here. */ + +static void +store_op1 (op, loc, arg) + re_opcode_t op; + unsigned char *loc; + int arg; +{ + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg); +} + + +/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. + After the one-byte opcode at LOC, ARG1 is stored starting at LOC + 1 + and ARG2 starting at LOC + 3. */ + +static void +store_op2 (op, loc, arg1, arg2) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; +{ + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg1); + STORE_NUMBER (loc + 3, arg2); +} + + +/* Copy the bytes from LOC to END to open up three bytes of space at LOC + for OP followed by two-byte integer parameter ARG.
*/ + +static void +insert_op1 (op, loc, arg, end) + re_opcode_t op; + unsigned char *loc; + int arg; + unsigned char *end; +{ + register unsigned char *pfrom = end; + register unsigned char *pto = end + 3; + + /* Move the bytes from LOC up to but not including END three places higher, copying from the top down so no byte is overwritten before it has been moved. */ while (pfrom != loc) + *--pto = *--pfrom; + + store_op1 (op, loc, arg); +} + + +/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. + The bytes from LOC up to but not including END are moved five places + higher, again copying from the top down, and store_op2 then writes + OP, ARG1 and ARG2 at LOC. */ + +static void +insert_op2 (op, loc, arg1, arg2, end) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; + unsigned char *end; +{ + register unsigned char *pfrom = end; + register unsigned char *pto = end + 5; + + while (pfrom != loc) + *--pto = *--pfrom; + + store_op2 (op, loc, arg1, arg2); +} + + +/* P points to just after a ^ in PATTERN. Return true if that ^ comes + after an alternative or a begin-subexpression. We assume there is at + least one character before the ^. + + PREV (P - 2) is the character just before the ^. An open-group or + alternation character there counts when the syntax makes it an + operator without a backslash (RE_NO_BK_PARENS, RE_NO_BK_VBAR) or + when the character before PREV is a backslash. */ + +static boolean +at_begline_loc_p (pattern, p, syntax) + const char *pattern, *p; + reg_syntax_t syntax; +{ + const char *prev = p - 2; + boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; + + return + /* After a subexpression? */ + (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) + /* After an alternative? */ + || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); +} + + +/* The dual of at_begline_loc_p. This one is for $. We assume there is + at least one character after the $, i.e., `P < PEND'. + + NEXT is the character at P and NEXT_NEXT points at the one after it, + or is NULL when P + 1 is not below PEND. + NOTE(review): SYNTAX is declared int here but reg_syntax_t in + at_begline_loc_p; confirm which type the prototype uses. */ + +static boolean +at_endline_loc_p (p, pend, syntax) + const char *p, *pend; + int syntax; +{ + const char *next = p; + boolean next_backslash = *next == '\\'; + const char *next_next = p + 1 < pend ? p + 1 : NULL; + + return + /* Before a subexpression? */ + (syntax & RE_NO_BK_PARENS ? *next == ')' + : next_backslash && next_next && *next_next == ')') + /* Before an alternative? */ + || (syntax & RE_NO_BK_VBAR ? *next == '|' + : next_backslash && next_next && *next_next == '|'); +} + + +/* Returns true if REGNUM is in one of COMPILE_STACK's elements and + false if it's not.
*/ + +static boolean +group_in_compile_stack (compile_stack, regnum) + compile_stack_type compile_stack; + regnum_t regnum; +{ + int this_element; + + for (this_element = compile_stack.avail - 1; + this_element >= 0; + this_element--) + if (compile_stack.stack[this_element].regnum == regnum) + return true; + + return false; +} + + +/* Read the ending character of a range (in a bracket expression) from the + uncompiled pattern *P_PTR (which ends at PEND). We assume the + starting character is in `P[-2]'. (`P[-1]' is the character `-'.) + Then we set the translation of all bits between the starting and + ending characters (inclusive) in the compiled pattern B. + + Return an error code: REG_ERANGE if the pattern ends before the + ending character, or if the start is after the end and SYNTAX has + RE_NO_EMPTY_RANGES set; REG_NOERROR otherwise. *P_PTR is advanced + past the ending character unless the pattern had already ended. + + We use these short variable names so we can use the same macros as + `regex_compile' itself. */ + +static reg_errcode_t +compile_range (p_ptr, pend, translate, syntax, b) + const char **p_ptr, *pend; + char *translate; + reg_syntax_t syntax; + unsigned char *b; +{ + unsigned this_char; + + const char *p = *p_ptr; + int range_start, range_end; + + if (p == pend) + return REG_ERANGE; + + /* Even though the pattern is a signed `char *', we need to fetch + with unsigned char *'s; if the high bit of the pattern character + is set, the range endpoints will be negative if we fetch using a + signed char *. + + We also want to fetch the endpoints without translating them; the + appropriate translation is done in the bit-setting loop below. */ + range_start = ((unsigned char *) p)[-2]; + range_end = ((unsigned char *) p)[0]; + + /* Have to increment the pointer into the pattern string, so the + caller isn't still at the ending character. */ + (*p_ptr)++; + + /* If the start is after the end, the range is empty. */ + if (range_start > range_end) + return syntax & RE_NO_EMPTY_RANGES ?
REG_ERANGE : REG_NOERROR; + + /* Here we see why `this_char' has to be larger than an `unsigned + char' -- the range is inclusive, so if `range_end' == 0xff + (assuming 8-bit characters), we would otherwise go into an infinite + loop, since all characters <= 0xff. */ + for (this_char = range_start; this_char <= range_end; this_char++) + { + SET_LIST_BIT (TRANSLATE (this_char)); + } + + return REG_NOERROR; +} + +/* Failure stack declarations and macros; both re_compile_fastmap and + re_match_2 use a failure stack. These have to be macros because of + REGEX_ALLOCATE. */ + + +/* Number of failure points for which to initially allocate space + when matching. If this number is exceeded, we allocate more + space, so it is not a hard limit. */ +#ifndef INIT_FAILURE_ALLOC +#define INIT_FAILURE_ALLOC 5 +#endif + +/* Roughly the maximum number of failure points on the stack. Would be + exactly that if always used MAX_FAILURE_SPACE each time we failed. + (NOTE(review): no MAX_FAILURE_SPACE is defined nearby; presumably + MAX_FAILURE_ITEMS, which DOUBLE_FAIL_STACK multiplies this by, is + meant.) + This is a variable only so users of regex can assign to it; we never + change it ourselves. */ +int re_max_failures = 2000; + +typedef const unsigned char *fail_stack_elt_t; + +typedef struct +{ + fail_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} fail_stack_type; + +#define FAIL_STACK_EMPTY() (fail_stack.avail == 0) +#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) +#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) +#define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) + + +/* Initialize `fail_stack'. Do `return -2' if the alloc fails. + On success the stack has room for INIT_FAILURE_ALLOC elements and + nothing pushed. The return statement means this can only be used + inside a function that returns a value. */ + +#define INIT_FAIL_STACK() \ + do { \ + fail_stack.stack = (fail_stack_elt_t *) \ + REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + \ + if (fail_stack.stack == NULL) \ + return -2; \ + \ + fail_stack.size = INIT_FAILURE_ALLOC; \ + fail_stack.avail = 0; \ + } while (0) + + +/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
+ + Return 1 if succeeds, and 0 if either ran out of memory + allocating space for it or it was already too large. + + REGEX_REALLOCATE requires `destination' be declared. */ + +#define DOUBLE_FAIL_STACK(fail_stack) \ + ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ + ? 0 \ + : ((fail_stack).stack = (fail_stack_elt_t *) \ + REGEX_REALLOCATE ((fail_stack).stack, \ + (fail_stack).size * sizeof (fail_stack_elt_t), \ + ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + \ + (fail_stack).stack == NULL \ + ? 0 \ + : ((fail_stack).size <<= 1, \ + 1))) + + +/* Push PATTERN_OP on FAIL_STACK. + + Return 1 if was able to do so and 0 if ran out of memory allocating + space to do so (or if DOUBLE_FAIL_STACK refused because the stack + was already too large). The stack is doubled only when + FAIL_STACK_FULL reports no room; note that FAIL_STACK_FULL tests the + variable named fail_stack, not the FAIL_STACK argument. */ +#define PUSH_PATTERN_OP(pattern_op, fail_stack) \ + ((FAIL_STACK_FULL () \ + && !DOUBLE_FAIL_STACK (fail_stack)) \ + ? 0 \ + : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ + 1)) + +/* This pushes an item onto the failure stack. Must be a four-byte + value. Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. + ITEM is cast to fail_stack_elt_t; no room check is made here, so + the caller must have ensured a free slot. */ +#define PUSH_FAILURE_ITEM(item) \ + fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item + +/* The complement operation. Assumes `fail_stack' is nonempty. + Decrements avail and yields the element that was on top. */ +#define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] + +/* Used to omit pushing failure point id's when we're not debugging. + With DEBUG defined these forward to PUSH_FAILURE_ITEM and + POP_FAILURE_ITEM; otherwise both expand to nothing. */ +#ifdef DEBUG +#define DEBUG_PUSH PUSH_FAILURE_ITEM +#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () +#else +#define DEBUG_PUSH(item) +#define DEBUG_POP(item_addr) +#endif + + +/* Push the information about the state we will need + if we ever fail back to it. + + Requires variables fail_stack, regstart, regend, reg_info, and + num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be + declared. + + Does `return FAILURE_CODE' if runs out of memory.
*/ + +#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ + do { \ + char *destination; \ + /* Must be int, so when we don't save any registers, the arithmetic \ + of 0 + -1 isn't done as unsigned. */ \ + int this_reg; \ + \ + DEBUG_STATEMENT (failure_id++); \ + DEBUG_STATEMENT (nfailure_points_pushed++); \ + DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ + DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ + DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ + \ + DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ + DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ + \ + /* Ensure we have enough space allocated for what we will push. */ \ + while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ + { \ + if (!DOUBLE_FAIL_STACK (fail_stack)) \ + return failure_code; \ + \ + DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ + (fail_stack).size); \ + DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ + } \ + \ + /* Push the info, starting with the registers.
*/ \ + DEBUG_PRINT1 ("\n"); \ + \ + for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ + this_reg++) \ + { \ + DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ + DEBUG_STATEMENT (num_regs_pushed++); \ + \ + DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ + PUSH_FAILURE_ITEM (regstart[this_reg]); \ + \ + DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ + PUSH_FAILURE_ITEM (regend[this_reg]); \ + \ + DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ + DEBUG_PRINT2 (" match_null=%d", \ + REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ + DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ + DEBUG_PRINT2 (" matched_something=%d", \ + MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT2 (" ever_matched=%d", \ + EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT1 ("\n"); \ + PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ + } \ + \ + DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ + PUSH_FAILURE_ITEM (lowest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ + PUSH_FAILURE_ITEM (highest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ + PUSH_FAILURE_ITEM (pattern_place); \ + \ + DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ + DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ + size2); \ + DEBUG_PRINT1 ("'\n"); \ + PUSH_FAILURE_ITEM (string_place); \ + \ + DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ + DEBUG_PUSH (failure_id); \ + } while (0) + +/* This is the number of items that are pushed and popped on the stack + for each register: its regstart entry, its regend entry, and the + word of its reg_info entry. */ +#define NUM_REG_ITEMS 3 + +/* Individual items aside from the registers: the lowest and highest + active register numbers, the pattern place, and the string place, + plus the failure id when DEBUG is defined. */ +#ifdef DEBUG +#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ +#else +#define NUM_NONREG_ITEMS 4 +#endif + +/* We push at most this many items on the stack.
*/ +#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) + +/* We actually push this many items: one group of NUM_REG_ITEMS for each + register from lowest_active_reg through highest_active_reg, plus the + NUM_NONREG_ITEMS non-register items. */ +#define NUM_FAILURE_ITEMS \ + ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ + + NUM_NONREG_ITEMS) + +/* How many items can still be added to the stack without overflowing it. */ +#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) + + +/* Pops what PUSH_FAILURE_POINT pushes. + + We restore into the parameters, all of which should be lvalues: + STR -- the saved data position. + PAT -- the saved pattern position. + LOW_REG, HIGH_REG -- the lowest and highest active registers. + REGSTART, REGEND -- arrays of string positions. + REG_INFO -- array of information about each subexpression. + + Also assumes the variables `fail_stack' and (if debugging), `bufp', + `pend', `string1', `size1', `string2', and `size2'. */ + +#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ +{ \ + DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ + int this_reg; \ + const unsigned char *string_temp; \ + \ + assert (!FAIL_STACK_EMPTY ()); \ + \ + /* Remove failure points and point to how many regs pushed. */ \ + DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ + DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ + DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ + \ + assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ + \ + DEBUG_POP (&failure_id); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + \ + /* If the saved string location is NULL, it came from an \ + on_failure_keep_string_jump opcode, and we want to throw away the \ + saved NULL, thus retaining our current position in the string.
*/ \ + string_temp = POP_FAILURE_ITEM (); \ + if (string_temp != NULL) \ + str = (const char *) string_temp; \ + \ + DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ + DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ + DEBUG_PRINT1 ("'\n"); \ + \ + pat = (unsigned char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ + \ + /* Restore register info. The high and low bounds come off first, then each register from HIGH_REG down to LOW_REG gives back its info word, end and start, the reverse of the order PUSH_FAILURE_POINT pushes them. */ \ + high_reg = (unsigned) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ + \ + low_reg = (unsigned) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ + \ + for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ + { \ + DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ + \ + reg_info[this_reg].word = POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ + \ + regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ + \ + regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ + } \ + \ + DEBUG_STATEMENT (nfailure_points_popped++); \ +} /* POP_FAILURE_POINT */ + +/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in + BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible + characters can start a string that matches the pattern. This fastmap + is used by re_search to skip quickly over impossible starting points. + + The caller must supply the address of a (1 << BYTEWIDTH)-byte data + area as BUFP->fastmap. + + We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in + the pattern buffer. + + Returns 0 if we succeed, -2 if an internal error. */ + +int +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; +{ + int j, k; + fail_stack_type fail_stack; +#ifndef REGEX_MALLOC + char *destination; +#endif + /* We don't push any register information onto the failure stack.
*/ + unsigned num_regs = 0; + + register char *fastmap = bufp->fastmap; + unsigned char *pattern = bufp->buffer; + unsigned long size = bufp->used; + const unsigned char *p = pattern; + register unsigned char *pend = pattern + size; + + /* Assume that each path through the pattern can be null until + proven otherwise. We set this false at the bottom of switch + statement, to which we get only if a particular path doesn't + match the empty string. */ + boolean path_can_be_null = true; + + /* We aren't doing a `succeed_n' to begin with. */ + boolean succeed_n_p = false; + + assert (fastmap != NULL && p != NULL); + + INIT_FAIL_STACK (); + bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ + bufp->fastmap_accurate = 1; /* It will be when we're done. */ + bufp->can_be_null = 0; + + while (p != pend || !FAIL_STACK_EMPTY ()) + { + if (p == pend) + { + bufp->can_be_null |= path_can_be_null; + + /* Reset for next path. */ + path_can_be_null = true; + + p = fail_stack.stack[--fail_stack.avail]; + } + + /* We should never be about to go beyond the end of the pattern. */ + assert (p < pend); + +#ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) +#else + switch ((re_opcode_t) *p++) +#endif + { + + /* I guess the idea here is to simply not bother with a fastmap + if a backreference is used, since it's too hard to figure out + the fastmap for the corresponding group. Setting + `can_be_null' stops `re_search_2' from using the fastmap, so + that is all we do. */ + case duplicate: + bufp->can_be_null = 1; + return 0; + + + /* Following are the cases which match a character. These end + with `break'. */ + + case exactn: + fastmap[p[1]] = 1; + break; + + + case charset: + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + break; + + + case charset_not: + /* Chars beyond end of map must be allowed. 
*/ + for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + break; + + + case wordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == Sword) + fastmap[j] = 1; + break; + + + case notwordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != Sword) + fastmap[j] = 1; + break; + + + case anychar: + /* `.' matches anything ... */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + /* ... except perhaps newline. */ + if (!(bufp->syntax & RE_DOT_NEWLINE)) + fastmap['\n'] = 0; + + /* Return if we have already set `can_be_null'; if we have, + then the fastmap is irrelevant. Something's wrong here. */ + else if (bufp->can_be_null) + return 0; + + /* Otherwise, have to check alternative paths. */ + break; + + +#ifdef emacs + case syntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + case notsyntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + /* All cases after this match the empty string. These end with + `continue'. */ + + + case before_dot: + case at_dot: + case after_dot: + continue; +#endif /* emacs */ + + + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbound: + case notwordbound: + case wordbeg: + case wordend: + case push_dummy_failure: + continue; + + + case jump_n: + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case jump_past_alt: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + if (j > 0) + continue; + + /* Jump backward implies we just went through the body of a + loop and matched nothing. Opcode jumped to should be + `on_failure_jump' or `succeed_n'. Just treat it like an + ordinary jump.
For a * loop, it has pushed its failure + point already; if so, discard that as redundant. */ + if ((re_opcode_t) *p != on_failure_jump + && (re_opcode_t) *p != succeed_n) + continue; + + p++; + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + + /* If what's on the stack is where we are now, pop it. */ + if (!FAIL_STACK_EMPTY () + && fail_stack.stack[fail_stack.avail - 1] == p) + fail_stack.avail--; + + continue; + + + case on_failure_jump: + case on_failure_keep_string_jump: + handle_on_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + + /* For some patterns, e.g., `(a?)?', `p+j' here points to the + end of the pattern. We don't want to push such a point, + since when we restore it above, entering the switch will + increment `p' past the end of the pattern. We don't need + to push such a point since we obviously won't find any more + fastmap entries beyond `pend'. Such a pattern can match + the null string, though. */ + if (p + j < pend) + { + if (!PUSH_PATTERN_OP (p + j, fail_stack)) + return -2; + } + else + bufp->can_be_null = 1; + + if (succeed_n_p) + { + EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ + succeed_n_p = false; + } + + continue; + + + case succeed_n: + /* Get to the number of times to succeed. */ + p += 2; + + /* Increment p past the n for when k != 0. */ + EXTRACT_NUMBER_AND_INCR (k, p); + if (k == 0) + { + p -= 4; + succeed_n_p = true; /* Spaghetti code alert. */ + goto handle_on_failure_jump; + } + continue; + + + case set_number_at: + p += 4; + continue; + + + case start_memory: + case stop_memory: + p += 2; + continue; + + + default: + abort (); /* We have listed all the cases. */ + } /* switch *p++ */ + + /* Getting here means we have found the possible starting + characters for one path of the pattern -- and that the empty + string does not match. We need not follow this path further. + Instead, look at the next alternative (remembered on the + stack), or quit if no more. The test at the top of the loop + does these things. 
*/ + path_can_be_null = false; + p = pend; + } /* while p */ + + /* Set `can_be_null' for the last path (also the first path, if the + pattern is empty). */ + bufp->can_be_null |= path_can_be_null; + return 0; +} /* re_compile_fastmap */ + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFP and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data: BUFP->regs_allocated is reset to REGS_UNALLOCATED and + REGS is emptied (zero registers, null START and END). Otherwise + BUFP->regs_allocated is set to REGS_REALLOCATE. + + Unless this function is called, the first search or match using + BUFP will allocate its own register data, without + freeing the old data. */ + +void +re_set_registers (bufp, regs, num_regs, starts, ends) + struct re_pattern_buffer *bufp; + struct re_registers *regs; + unsigned num_regs; + regoff_t *starts, *ends; +{ + if (num_regs) + { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } + else + { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + /* START and END are pointers to regoff_t, so clear them with a + null pointer of that type rather than a regoff_t integer. */ + regs->start = regs->end = (regoff_t *) 0; + } +} + +/* Searching routines. */ + +/* Like re_search_2, below, but only one string is specified, and + doesn't let you say where to stop matching. */ + +int +re_search (bufp, string, size, startpos, range, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, startpos, range; + struct re_registers *regs; +{ + return re_search_2 (bufp, NULL, 0, string, size, startpos, range, + regs, size); +} + + +/* Using the compiled pattern in BUFP->buffer, first tries to match the + virtual concatenation of STRING1 and STRING2, starting first at index + STARTPOS, then at STARTPOS + 1, and so on. + + STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. + + RANGE is how far to scan while trying to match.
RANGE = 0 means try + only at STARTPOS; in general, the last start tried is STARTPOS + + RANGE. + + In REGS, return the indices of the virtual concatenation of STRING1 + and STRING2 that matched the entire BUFP->buffer and its contained + subexpressions. + + Do not consider matching one past the index STOP in the virtual + concatenation of STRING1 and STRING2. + + We return either the position in the strings at which the match was + found, -1 if no match, or -2 if error (such as failure + stack overflow). */ + +int +re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int startpos; + int range; + struct re_registers *regs; + int stop; +{ + int val; + register char *fastmap = bufp->fastmap; + register char *translate = bufp->translate; + int total_size = size1 + size2; + int endpos = startpos + range; + + /* Check for out-of-range STARTPOS. */ + if (startpos < 0 || startpos > total_size) + return -1; + + /* Fix up RANGE if it might eventually take us outside + the virtual concatenation of STRING1 and STRING2. */ + if (endpos < -1) + range = -1 - startpos; + else if (endpos > total_size) + range = total_size - startpos; + + /* If the search isn't to be a backwards one, don't waste time in a + search for a pattern that must be anchored. */ + if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) + { + if (startpos > 0) + return -1; + else + range = 1; + } + + /* Update the fastmap now if not correct already. */ + if (fastmap && !bufp->fastmap_accurate) + if (re_compile_fastmap (bufp) == -2) + return -2; + + /* Loop through the string, looking for a place to start matching. */ + for (;;) + { + /* If a fastmap is supplied, skip quickly over characters that + cannot be the start of a match. If the pattern can match the + null string, however, we don't need to skip characters; we want + the first null string. 
*/ + if (fastmap && startpos < total_size && !bufp->can_be_null) + { + if (range > 0) /* Searching forwards. */ + { + register const char *d; + register int lim = 0; + int irange = range; + + if (startpos < size1 && startpos + range >= size1) + lim = range - (size1 - startpos); + + d = (startpos >= size1 ? string2 - size1 : string1) + startpos; + + /* Written out as an if-else to avoid testing `translate' + inside the loop. */ + if (translate) + while (range > lim + && !fastmap[(unsigned char) + translate[(unsigned char) *d++]]) + range--; + else + while (range > lim && !fastmap[(unsigned char) *d++]) + range--; + + startpos += irange - range; + } + else /* Searching backwards. */ + { + register char c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); + + if (!fastmap[(unsigned char) TRANSLATE (c)]) + goto advance; + } + } + + /* If can't match the null string, and that's all we have left, fail. */ + if (range >= 0 && startpos == total_size && fastmap + && !bufp->can_be_null) + return -1; + + val = re_match_2 (bufp, string1, size1, string2, size2, + startpos, regs, stop); + if (val >= 0) + return startpos; + + if (val == -2) + return -2; + + advance: + if (!range) + break; + else if (range > 0) + { + range--; + startpos++; + } + else + { + range++; + startpos--; + } + } + return -1; +} /* re_search_2 */ + +/* Declarations and macros for re_match_2. */ + +static int bcmp_translate (); +static boolean alt_match_null_string_p (), + common_op_match_null_string_p (), + group_match_null_string_p (); + +/* Structure for per-register (a.k.a. per-group) information. + This must not be longer than one word, because we push this value + onto the failure stack. Other register information, such as the + starting and ending positions (which are addresses), and the list of + inner groups (which is a bits list) are maintained in separate + variables. 
+ + We are making a (strictly speaking) nonportable assumption here: that + the compiler will pack our bit fields into something that fits into + the type of `word', i.e., is something that fits into one item on the + failure stack. */ +typedef union +{ + fail_stack_elt_t word; + struct + { + /* This field is one if this group can match the empty string, + zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ +#define MATCH_NULL_UNSET_VALUE 3 + unsigned match_null_string_p : 2; + unsigned is_active : 1; + unsigned matched_something : 1; + unsigned ever_matched_something : 1; + } bits; +} register_info_type; + +#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) +#define IS_ACTIVE(R) ((R).bits.is_active) +#define MATCHED_SOMETHING(R) ((R).bits.matched_something) +#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) + + +/* Call this when have matched a real character; it sets `matched' flags + for the subexpressions which we are currently inside. Also records + that those subexprs have matched. */ +#define SET_REGS_MATCHED() \ + do \ + { \ + unsigned r; \ + for (r = lowest_active_reg; r <= highest_active_reg; r++) \ + { \ + MATCHED_SOMETHING (reg_info[r]) \ + = EVER_MATCHED_SOMETHING (reg_info[r]) \ + = 1; \ + } \ + } \ + while (0) + + +/* This converts PTR, a pointer into one of the search strings `string1' + and `string2' into an offset from the beginning of that string. */ +#define POINTER_TO_OFFSET(ptr) \ + (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) + +/* Registers are set to a sentinel when they haven't yet matched. */ +#define REG_UNSET_VALUE ((char *) -1) +#define REG_UNSET(e) ((e) == REG_UNSET_VALUE) + + +/* Macros for dealing with the split strings in re_match_2. */ + +#define MATCHING_IN_FIRST_STRING (dend == end_match_1) + +/* Call before fetching a character with *d. This switches over to + string2 if necessary. 
*/ +#define PREFETCH() \ + while (d == dend) \ + { \ + /* End of string2 => fail. */ \ + if (dend == end_match_2) \ + goto fail; \ + /* End of string1 => advance to string2. */ \ + d = string2; \ + dend = end_match_2; \ + } + + +/* Test if at very beginning or at very end of the virtual concatenation + of `string1' and `string2'. If only one string, it's `string2'. */ +#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) +#define AT_STRINGS_END(d) ((d) == end2) + + +/* Test if D points to a character which is word-constituent. We have + two special cases to check for: if past the end of string1, look at + the first character in string2; and if before the beginning of + string2, look at the last character in string1. */ +#define WORDCHAR_P(d) \ + (SYNTAX ((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ + == Sword) + +/* Test if the character before D and the one at D differ with respect + to being word-constituent. */ +#define AT_WORD_BOUNDARY(d) \ + (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ + || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) + + +/* Free everything we malloc. */ +#ifdef REGEX_MALLOC +#define FREE_VAR(var) if (var) free (var); var = NULL +#define FREE_VARIABLES() \ + do { \ + FREE_VAR (fail_stack.stack); \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (old_regstart); \ + FREE_VAR (old_regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + FREE_VAR (reg_dummy); \ + FREE_VAR (reg_info_dummy); \ + } while (0) +#else /* not REGEX_MALLOC */ +/* Some MIPS systems (at least) want this to free alloca'd storage. */ +#define FREE_VARIABLES() alloca (0) +#endif /* not REGEX_MALLOC */ + + +/* These values must meet several constraints. They must not be valid + register values; since we have a limit of 255 registers (because + we use only one byte in the pattern for the register number), we can + use numbers larger than 255. 
They must differ by 1, because of + NUM_FAILURE_ITEMS above. And the value for the lowest register must + be larger than the value for the highest register, so we do not try + to actually save any registers when none are active. */ +#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) +#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) + +/* Matching routines. */ + +#ifndef emacs /* Emacs never uses this. */ +/* re_match is like re_match_2 except it takes only a single string. */ + +int +re_match (bufp, string, size, pos, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, pos; + struct re_registers *regs; + { + return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); +} +#endif /* not emacs */ + + +/* re_match_2 matches the compiled pattern in BUFP against the + (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 + and SIZE2, respectively). We start matching at POS, and stop + matching at STOP. + + If REGS is non-null and the `no_sub' field of BUFP is zero, we + store offsets for the substring each group matched in REGS. See the + documentation for exactly how many groups we fill. + + We return -1 if no match, -2 if an internal error (such as the + failure stack overflowing). Otherwise, we return the length of the + matched substring. */ + +int +re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int pos; + struct re_registers *regs; + int stop; +{ + /* General temporaries. */ + int mcnt; + unsigned char *p1; + + /* Just past the end of the corresponding string. */ + const char *end1, *end2; + + /* Pointers into string1 and string2, just past the last characters in + each to consider matching. */ + const char *end_match_1, *end_match_2; + + /* Where we are in the data, and the end of the current string. */ + const char *d, *dend; + + /* Where we are in the pattern, and the end of the pattern.
*/ + unsigned char *p = bufp->buffer; + register unsigned char *pend = p + bufp->used; + + /* We use this to map every character in the string. */ + char *translate = bufp->translate; + + /* Failure point stack. Each place that can handle a failure further + down the line pushes a failure point on this stack. It consists of + regstart, regend, and reg_info for all registers corresponding to + the subexpressions we're currently inside, plus the lowest and + highest active register numbers, and, finally, two char *'s. The first char * is where + to resume scanning the pattern; the second one is where to resume + scanning the strings. If the latter is zero, the failure point is + a ``dummy''; if a failure happens and the failure point is a dummy, + it gets discarded and the next one is tried. */ + fail_stack_type fail_stack; +#ifdef DEBUG + static unsigned failure_id = 0; + unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; +#endif + + /* We fill all the registers internally, independent of what we + return, for use in backreferences. The number here includes + an element for register zero. */ + unsigned num_regs = bufp->re_nsub + 1; + + /* The currently active registers. */ + unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; + unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; + + /* Information on the contents of registers. These are pointers into + the input strings; they record just what was matched (on this + attempt) by a subexpression part of the pattern, that is, the + regnum-th regstart pointer points to where in the string we began + matching and the regnum-th regend points to right after where we + stopped matching the regnum-th subexpression. (The zeroth register + keeps track of what the whole pattern matches.)
*/ + const char **regstart, **regend; + + /* If a group that's operated upon by a repetition operator fails to + match anything, then the register for its start will need to be + restored because it will have been set to wherever in the string we + are when we last see its open-group operator. Similarly for a + register's end. */ + const char **old_regstart, **old_regend; + + /* The is_active field of reg_info helps us keep track of which (possibly + nested) subexpressions we are currently in. The matched_something + field of reg_info[reg_num] helps us tell whether or not we have + matched any of the pattern so far this time through the reg_num-th + subexpression. These two fields get reset each time through any + loop their register is in. */ + register_info_type *reg_info; + + /* The following record the register info as found in the above + variables when we find a match better than any we've seen before. + This happens as we backtrack through the failure points, which in + turn happens only if we have not yet matched the entire string. */ + unsigned best_regs_set = false; + const char **best_regstart, **best_regend; + + /* Logically, this is `best_regend[0]'. But we don't want to have to + allocate space for that if we're not allocating space for anything + else (see below). Also, we never need info about register 0 for + any of the other register vectors, and it seems rather a kludge to + treat `best_regend' differently than the rest. So we keep track of + the end of the best match so far in a separate variable. We + initialize this to NULL so that when we backtrack the first time + and need to test it, it's not garbage. */ + const char *match_end = NULL; + + /* Used when we pop values we don't care about. */ + const char **reg_dummy; + register_info_type *reg_info_dummy; + +#ifdef DEBUG + /* Counts the total number of registers pushed. 
*/ + unsigned num_regs_pushed = 0; +#endif + + DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); + + INIT_FAIL_STACK (); + + /* Do not bother to initialize all the register variables if there are + no groups in the pattern, as it takes a fair amount of time. If + there are groups, we include space for register 0 (the whole + pattern), even though we never use it, since it simplifies the + array indexing. We should fix this. */ + if (bufp->re_nsub) + { + regstart = REGEX_TALLOC (num_regs, const char *); + regend = REGEX_TALLOC (num_regs, const char *); + old_regstart = REGEX_TALLOC (num_regs, const char *); + old_regend = REGEX_TALLOC (num_regs, const char *); + best_regstart = REGEX_TALLOC (num_regs, const char *); + best_regend = REGEX_TALLOC (num_regs, const char *); + reg_info = REGEX_TALLOC (num_regs, register_info_type); + reg_dummy = REGEX_TALLOC (num_regs, const char *); + reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); + + if (!(regstart && regend && old_regstart && old_regend && reg_info + && best_regstart && best_regend && reg_dummy && reg_info_dummy)) + { + FREE_VARIABLES (); + return -2; + } + } +#ifdef REGEX_MALLOC + else + { + /* We must initialize all our variables to NULL, so that + `FREE_VARIABLES' doesn't try to free them. */ + regstart = regend = old_regstart = old_regend = best_regstart + = best_regend = reg_dummy = NULL; + reg_info = reg_info_dummy = (register_info_type *) NULL; + } +#endif /* REGEX_MALLOC */ + + /* The starting position is bogus. */ + if (pos < 0 || pos > size1 + size2) + { + FREE_VARIABLES (); + return -1; + } + + /* Initialize subexpression text positions to -1 to mark ones that no + start_memory/stop_memory has been seen for. Also initialize the + register information struct. 
*/ + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = regend[mcnt] + = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; + + REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; + IS_ACTIVE (reg_info[mcnt]) = 0; + MATCHED_SOMETHING (reg_info[mcnt]) = 0; + EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; + } + + /* We move `string1' into `string2' if the latter's empty -- but not if + `string1' is null. */ + if (size2 == 0 && string1 != NULL) + { + string2 = string1; + size2 = size1; + string1 = 0; + size1 = 0; + } + end1 = string1 + size1; + end2 = string2 + size2; + + /* Compute where to stop matching, within the two strings. */ + if (stop <= size1) + { + end_match_1 = string1 + stop; + end_match_2 = string2; + } + else + { + end_match_1 = end1; + end_match_2 = string2 + stop - size1; + } + + /* `p' scans through the pattern as `d' scans through the data. + `dend' is the end of the input string that `d' points within. `d' + is advanced into the following input string whenever necessary, but + this happens before fetching; therefore, at the beginning of the + loop, `d' can be pointing at the end of a string, but it cannot + equal `string2'. */ + if (size1 > 0 && pos <= size1) + { + d = string1 + pos; + dend = end_match_1; + } + else + { + d = string2 + pos - size1; + dend = end_match_2; + } + + DEBUG_PRINT1 ("The compiled pattern is: "); + DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); + DEBUG_PRINT1 ("The string to match is: `"); + DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); + DEBUG_PRINT1 ("'\n"); + + /* This loops over pattern commands. It exits by returning from the + function if the match is complete, or it drops through if the match + fails at this starting point in the input data. */ + for (;;) + { + DEBUG_PRINT2 ("\n0x%x: ", p); + + if (p == pend) + { /* End of pattern means we might have succeeded. */ + DEBUG_PRINT1 ("end of pattern ... 
"); + + /* If we haven't matched the entire string, and we want the + longest match, try backtracking. */ + if (d != end_match_2) + { + DEBUG_PRINT1 ("backtracking.\n"); + + if (!FAIL_STACK_EMPTY ()) + { /* More failure points to try. */ + boolean same_str_p = (FIRST_STRING_P (match_end) + == MATCHING_IN_FIRST_STRING); + + /* If exceeds best match so far, save it. */ + if (!best_regs_set + || (same_str_p && d > match_end) + || (!same_str_p && !MATCHING_IN_FIRST_STRING)) + { + best_regs_set = true; + match_end = d; + + DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + best_regstart[mcnt] = regstart[mcnt]; + best_regend[mcnt] = regend[mcnt]; + } + } + goto fail; + } + + /* If no failure points, don't restore garbage. */ + else if (best_regs_set) + { + restore_best_regs: + /* Restore best match. It may happen that `dend == + end_match_1' while the restored d is in string2. + For example, the pattern `x.*y.*z' against the + strings `x-' and `y-z-', if the two strings are + not consecutive in memory. */ + DEBUG_PRINT1 ("Restoring best registers.\n"); + + d = match_end; + dend = ((d >= string1 && d <= end1) + ? end_match_1 : end_match_2); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = best_regstart[mcnt]; + regend[mcnt] = best_regend[mcnt]; + } + } + } /* d != end_match_2 */ + + DEBUG_PRINT1 ("Accepting match.\n"); + + /* If caller wants register contents data back, do it. */ + if (regs && !bufp->no_sub) + { + /* Have the register data arrays been allocated? */ + if (bufp->regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. 
*/ + regs->num_regs = MAX (RE_NREGS, num_regs + 1); + regs->start = TALLOC (regs->num_regs, regoff_t); + regs->end = TALLOC (regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + bufp->regs_allocated = REGS_REALLOCATE; + } + else if (bufp->regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. */ + if (regs->num_regs < num_regs + 1) + { + regs->num_regs = num_regs + 1; + RETALLOC (regs->start, regs->num_regs, regoff_t); + RETALLOC (regs->end, regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + } + } + else + assert (bufp->regs_allocated == REGS_FIXED); + + /* Convert the pointer data in `regstart' and `regend' to + indices. Register zero has to be set differently, + since we haven't kept track of any info for it. */ + if (regs->num_regs > 0) + { + regs->start[0] = pos; + regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 + : d - string2 + size1); + } + + /* Go through the first `min (num_regs, regs->num_regs)' + registers, since that is all we initialized. */ + for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) + { + if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) + regs->start[mcnt] = regs->end[mcnt] = -1; + else + { + regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); + regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); + } + } + + /* If the regs structure we return has more elements than + were in the pattern, set the extra elements to -1. If + we (re)allocated the registers, this is the case, + because we always allocate enough to have at least one + -1 at the end. 
*/ + for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) + regs->start[mcnt] = regs->end[mcnt] = -1; + } /* regs && !bufp->no_sub */ + + FREE_VARIABLES (); + DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", + nfailure_points_pushed, nfailure_points_popped, + nfailure_points_pushed - nfailure_points_popped); + DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); + + mcnt = d - pos - (MATCHING_IN_FIRST_STRING + ? string1 + : string2 - size1); + + DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); + + return mcnt; + } + + /* Otherwise match next pattern command. */ +#ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) +#else + switch ((re_opcode_t) *p++) +#endif + { + /* Ignore these. Used to ignore the n of succeed_n's which + currently have n == 0. */ + case no_op: + DEBUG_PRINT1 ("EXECUTING no_op.\n"); + break; + + + /* Match the next n pattern characters exactly. The following + byte in the pattern defines n, and the n bytes after that + are the characters to match. */ + case exactn: + mcnt = *p++; + DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); + + /* This is written out as an if-else so we don't waste time + testing `translate' inside the loop. */ + if (translate) + { + do + { + PREFETCH (); + if (translate[(unsigned char) *d++] != (char) *p++) + goto fail; + } + while (--mcnt); + } + else + { + do + { + PREFETCH (); + if (*d++ != (char) *p++) goto fail; + } + while (--mcnt); + } + SET_REGS_MATCHED (); + break; + + + /* Match any character except possibly a newline or a null. 
*/ + case anychar: + DEBUG_PRINT1 ("EXECUTING anychar.\n"); + + PREFETCH (); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') + || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d++; + break; + + + case charset: + case charset_not: + { + register unsigned char c; + boolean not = (re_opcode_t) *(p - 1) == charset_not; + + DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); + + PREFETCH (); + c = TRANSLATE (*d); /* The character to match. */ + + /* Cast to `unsigned' instead of `unsigned char' in case the + bit list is a full 32 bytes long. */ + if (c < (unsigned) (*p * BYTEWIDTH) + && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + p += 1 + *p; + + if (!not) goto fail; + + SET_REGS_MATCHED (); + d++; + break; + } + + + /* The beginning of a group is represented by start_memory. + The arguments are the register number in the next byte, and the + number of groups inner to this one in the next. The text + matched within the group is recorded (in the internal + registers data structure) under the register number. */ + case start_memory: + DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); + + /* Find out if this group can match the empty string. */ + p1 = p; /* To send to group_match_null_string_p. */ + + if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[*p]) + = group_match_null_string_p (&p1, pend, reg_info); + + /* Save the position in the string where we were the last time + we were at this open-group operator in case the group is + operated upon by a repetition operator, e.g., with `(a*)*b' + against `ab'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regstart[*p]) ? 
d : regstart[*p] + : regstart[*p]; + DEBUG_PRINT2 (" old_regstart: %d\n", + POINTER_TO_OFFSET (old_regstart[*p])); + + regstart[*p] = d; + DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); + + IS_ACTIVE (reg_info[*p]) = 1; + MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* This is the new highest active register. */ + highest_active_reg = *p; + + /* If nothing was active before, this is the new lowest active + register. */ + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *p; + + /* Move past the register number and inner group count. */ + p += 2; + break; + + + /* The stop_memory opcode represents the end of a group. Its + arguments are the same as start_memory's: the register + number, and the number of inner groups. */ + case stop_memory: + DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); + + /* We need to save the string position the last time we were at + this close-group operator in case the group is operated + upon by a repetition operator, e.g., with `((a*)*(b*)*)*' + against `aba'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regend[*p]) ? d : regend[*p] + : regend[*p]; + DEBUG_PRINT2 (" old_regend: %d\n", + POINTER_TO_OFFSET (old_regend[*p])); + + regend[*p] = d; + DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); + + /* This register isn't active anymore. */ + IS_ACTIVE (reg_info[*p]) = 0; + + /* If this was the only register active, nothing is active + anymore. */ + if (lowest_active_reg == highest_active_reg) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + { /* We must scan for the new highest active register, since + it isn't necessarily one less than now: consider + (a(b)c(d(e)f)g). When group 3 ends, after the f), the + new highest active register is 1. 
*/ + unsigned char r = *p - 1; + while (r > 0 && !IS_ACTIVE (reg_info[r])) + r--; + + /* If we end up at register zero, that means that we saved + the registers as the result of an `on_failure_jump', not + a `start_memory', and we jumped to past the innermost + `stop_memory'. For example, in ((.)*) we save + registers 1 and 2 as a result of the *, but when we pop + back to the second ), we are at the stop_memory 1. + Thus, nothing is active. */ + if (r == 0) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + highest_active_reg = r; + } + + /* If just failed to match something this time around with a + group that's operated on by a repetition operator, try to + force exit from the ``loop'', and restore the register + information for this group that we had before trying this + last match. */ + if ((!MATCHED_SOMETHING (reg_info[*p]) + || (re_opcode_t) p[-3] == start_memory) + && (p + 2) < pend) + { + boolean is_a_jump_n = false; + + p1 = p + 2; + mcnt = 0; + switch ((re_opcode_t) *p1++) + { + case jump_n: + is_a_jump_n = true; + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (is_a_jump_n) + p1 += 2; + break; + + default: + /* do nothing */ ; + } + p1 += mcnt; + + /* If the next operation is a jump backwards in the pattern + to an on_failure_jump right before the start_memory + corresponding to this stop_memory, exit from the loop + by forcing a failure after pushing on the stack the + on_failure_jump's jump in the pattern, and d. */ + if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump + && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) + { + /* If this group ever matched anything, then restore + what its registers were before trying this last + failed match, e.g., with `(a*)*b' against `ab' for + regstart[1], and, e.g., with `((a*)*(b*)*)*' + against `aba' for regend[3]. 
+ + Also restore the registers for inner groups for, + e.g., `((a*)(b*))*' against `aba' (register 3 would + otherwise get trashed). */ + + if (EVER_MATCHED_SOMETHING (reg_info[*p])) + { + unsigned r; + + EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* Restore this and inner groups' (if any) registers. */ + for (r = *p; r < *p + *(p + 1); r++) + { + regstart[r] = old_regstart[r]; + + /* xx why this test? */ + if ((int) old_regend[r] >= (int) regstart[r]) + regend[r] = old_regend[r]; + } + } + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + PUSH_FAILURE_POINT (p1 + mcnt, d, -2); + + goto fail; + } + } + + /* Move past the register number and the inner group count. */ + p += 2; + break; + + + /* \ has been turned into a `duplicate' command which is + followed by the numeric value of as the register number. */ + case duplicate: + { + register const char *d2, *dend2; + int regno = *p++; /* Get which register to match against. */ + DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); + + /* Can't back reference a group which we've never matched. */ + if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) + goto fail; + + /* Where in input to try to start matching. */ + d2 = regstart[regno]; + + /* Where to stop matching; if both the place to start and + the place to stop matching are in the same string, then + set to the place to stop, otherwise, for now have to use + the end of the first string. */ + + dend2 = ((FIRST_STRING_P (regstart[regno]) + == FIRST_STRING_P (regend[regno])) + ? regend[regno] : end_match_1); + for (;;) + { + /* If necessary, advance to next segment in register + contents. */ + while (d2 == dend2) + { + if (dend2 == end_match_2) break; + if (dend2 == regend[regno]) break; + + /* End of string1 => advance to string2. */ + d2 = string2; + dend2 = regend[regno]; + } + /* At end of register contents => success */ + if (d2 == dend2) break; + + /* If necessary, advance to next segment in data. 
*/ + PREFETCH (); + + /* How many characters left in this segment to match. */ + mcnt = dend - d; + + /* Want how many consecutive characters we can match in + one shot, so, if necessary, adjust the count. */ + if (mcnt > dend2 - d2) + mcnt = dend2 - d2; + + /* Compare that many; failure if mismatch, else move + past them. */ + if (translate + ? bcmp_translate (d, d2, mcnt, translate) + : bcmp (d, d2, mcnt)) + goto fail; + d += mcnt, d2 += mcnt; + } + } + break; + + + /* begline matches the empty string at the beginning of the string + (unless `not_bol' is set in `bufp'), and, if + `newline_anchor' is set, after newlines. */ + case begline: + DEBUG_PRINT1 ("EXECUTING begline.\n"); + + if (AT_STRINGS_BEG (d)) + { + if (!bufp->not_bol) break; + } + else if (d[-1] == '\n' && bufp->newline_anchor) + { + break; + } + /* In all other cases, we fail. */ + goto fail; + + + /* endline is the dual of begline. */ + case endline: + DEBUG_PRINT1 ("EXECUTING endline.\n"); + + if (AT_STRINGS_END (d)) + { + if (!bufp->not_eol) break; + } + + /* We have to ``prefetch'' the next character. */ + else if ((d == end1 ? *string2 : *d) == '\n' + && bufp->newline_anchor) + { + break; + } + goto fail; + + + /* Match at the very beginning of the data. */ + case begbuf: + DEBUG_PRINT1 ("EXECUTING begbuf.\n"); + if (AT_STRINGS_BEG (d)) + break; + goto fail; + + + /* Match at the very end of the data. */ + case endbuf: + DEBUG_PRINT1 ("EXECUTING endbuf.\n"); + if (AT_STRINGS_END (d)) + break; + goto fail; + + + /* on_failure_keep_string_jump is used to optimize `.*\n'. It + pushes NULL as the value for the string on the stack. Then + `pop_failure_point' will keep the current value for the + string, instead of restoring it. To see why, consider + matching `foo\nbar' against `.*\n'. The .* matches the foo; + then the . fails against the \n. But the next thing we want + to do is match the \n against the \n; if we restored the + string value, we would be back at the foo. 
+ + Because this is used only in specific cases, we don't need to + check all the things that `on_failure_jump' does, to make + sure the right things get saved on the stack. Hence we don't + share its code. The only reason to push anything on the + stack at all is that otherwise we would have to change + `anychar's code to do something besides goto fail in this + case; that seems worse than this. */ + case on_failure_keep_string_jump: + DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); + + PUSH_FAILURE_POINT (p + mcnt, NULL, -2); + break; + + + /* Uses of on_failure_jump: + + Each alternative starts with an on_failure_jump that points + to the beginning of the next alternative. Each alternative + except the last ends with a jump that in effect jumps past + the rest of the alternatives. (They really jump to the + ending jump of the following alternative, because tensioning + these jumps is a hassle.) + + Repeats start with an on_failure_jump that points past both + the repetition text and either the following jump or + pop_failure_jump back to this on_failure_jump. */ + case on_failure_jump: + on_failure: + DEBUG_PRINT1 ("EXECUTING on_failure_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); + + /* If this on_failure_jump comes right before a group (i.e., + the original * applied to a group), save the information + for that group and all inner ones, so that if we fail back + to this point, the group's information will be correct. + For example, in \(a*\)*\1, we need the preceding group, + and in \(\(a*\)b*\)\2, we need the inner group. */ + + /* We can't use `p' to check ahead because we push + a failure point to `p + mcnt' after we do this. 
*/ + p1 = p; + + /* We need to skip no_op's before we look for the + start_memory in case this on_failure_jump is happening as + the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 + against aba. */ + while (p1 < pend && (re_opcode_t) *p1 == no_op) + p1++; + + if (p1 < pend && (re_opcode_t) *p1 == start_memory) + { + /* We have a new highest active register now. This will + get reset at the start_memory we are about to get to, + but we will have saved all the registers relevant to + this repetition op, as described above. */ + highest_active_reg = *(p1 + 1) + *(p1 + 2); + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *(p1 + 1); + } + + DEBUG_PRINT1 (":\n"); + PUSH_FAILURE_POINT (p + mcnt, d, -2); + break; + + + /* A smart repeat ends with `maybe_pop_jump'. + We change it to either `pop_failure_jump' or `jump'. */ + case maybe_pop_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); + { + register unsigned char *p2 = p; + + /* Compare the beginning of the repeat with what in the + pattern follows its end. If we can establish that there + is nothing that they would both match, i.e., that we + would have to backtrack because of (as in, e.g., `a*a') + then we can change to pop_failure_jump, because we'll + never have to backtrack. + + This is not true in the case of alternatives: in + `(a|ab)*' we do need to backtrack to the `ab' alternative + (e.g., if the string was `ab'). But instead of trying to + detect that here, the alternative has put on a dummy + failure point which is what we will end up popping. */ + + /* Skip over open/close-group commands. */ + while (p2 + 2 < pend + && ((re_opcode_t) *p2 == stop_memory + || (re_opcode_t) *p2 == start_memory)) + p2 += 3; /* Skip over args, too. */ + + /* If we're at the end of the pattern, we can change. */ + if (p2 == pend) + { + /* Consider what happens when matching ":\(.*\)" + against ":/". I don't really understand this code + yet. 
*/ + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" End of pattern: change to `pop_failure_jump'.\n"); + } + + else if ((re_opcode_t) *p2 == exactn + || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) + { + register unsigned char c + = *p2 == (unsigned char) endline ? '\n' : p2[2]; + p1 = p + mcnt; + + /* p1[0] ... p1[2] are the `on_failure_jump' corresponding + to the `maybe_finalize_jump' of this case. Examine what + follows. */ + if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", + c, p1[5]); + } + + else if ((re_opcode_t) p1[3] == charset + || (re_opcode_t) p1[3] == charset_not) + { + int not = (re_opcode_t) p1[3] == charset_not; + + if (c < (unsigned char) (p1[4] * BYTEWIDTH) + && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + /* `not' is equal to 1 if c would match, which means + that we can't change to pop_failure_jump. */ + if (!not) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); + } + } + } + } + p -= 2; /* Point at relative address again. */ + if ((re_opcode_t) p[-1] != pop_failure_jump) + { + p[-1] = (unsigned char) jump; + DEBUG_PRINT1 (" Match => jump.\n"); + goto unconditional_jump; + } + /* Note fall through. */ + + + /* The end of a simple repeat has a pop_failure_jump back to + its matching on_failure_jump, where the latter will push a + failure point. The pop_failure_jump takes off failure + points put on by this pop_failure_jump's matching + on_failure_jump; we got through the pattern to here from the + matching on_failure_jump, so didn't fail. */ + case pop_failure_jump: + { + /* We need to pass separate storage for the lowest and + highest registers, even though we don't care about the + actual values. Otherwise, we will restore only one + register from the stack, since lowest will == highest in + `pop_failure_point'. 
*/ + unsigned dummy_low_reg, dummy_high_reg; + unsigned char *pdummy; + const char *sdummy; + + DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); + POP_FAILURE_POINT (sdummy, pdummy, + dummy_low_reg, dummy_high_reg, + reg_dummy, reg_dummy, reg_info_dummy); + } + /* Note fall through. */ + + + /* Unconditionally jump (without popping any failure points). */ + case jump: + unconditional_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ + DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); + p += mcnt; /* Do the jump. */ + DEBUG_PRINT2 ("(to 0x%x).\n", p); + break; + + + /* We need this opcode so we can detect where alternatives end + in `group_match_null_string_p' et al. */ + case jump_past_alt: + DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); + goto unconditional_jump; + + + /* Normally, the on_failure_jump pushes a failure point, which + then gets popped at pop_failure_jump. We will end up at + pop_failure_jump, also, and with a pattern of, say, `a+', we + are skipping over the on_failure_jump, so we have to push + something meaningless for pop_failure_jump to pop. */ + case dummy_failure_jump: + DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); + /* It doesn't matter what we push for the string here. What + the code at `fail' tests is the value for the pattern. */ + PUSH_FAILURE_POINT (0, 0, -2); + goto unconditional_jump; + + + /* At the end of an alternative, we need to push a dummy failure + point in case we are followed by a `pop_failure_jump', because + we don't want the failure point for the alternative to be + popped. For example, matching `(a|ab)*' against `aab' + requires that we match the `ab' alternative. */ + case push_dummy_failure: + DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); + /* See comments just above at `dummy_failure_jump' about the + two zeroes. */ + PUSH_FAILURE_POINT (0, 0, -2); + break; + + /* Have to succeed matching what follows at least n times. + After that, handle like `on_failure_jump'. 
*/ + case succeed_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); + + assert (mcnt >= 0); + /* Originally, this is how many times we HAVE to succeed. */ + if (mcnt > 0) + { + mcnt--; + p += 2; + STORE_NUMBER_AND_INCR (p, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); + } + else if (mcnt == 0) + { + DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); + p[2] = (unsigned char) no_op; + p[3] = (unsigned char) no_op; + goto on_failure; + } + break; + + case jump_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); + + /* Originally, this is how many times we CAN jump. */ + if (mcnt) + { + mcnt--; + STORE_NUMBER (p + 2, mcnt); + goto unconditional_jump; + } + /* If don't have to jump any more, skip over the rest of command. */ + else + p += 4; + break; + + case set_number_at: + { + DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + p1 = p + mcnt; + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); + STORE_NUMBER (p1, mcnt); + break; + } + + case wordbound: + DEBUG_PRINT1 ("EXECUTING wordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + break; + goto fail; + + case notwordbound: + DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + goto fail; + break; + + case wordbeg: + DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); + if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) + break; + goto fail; + + case wordend: + DEBUG_PRINT1 ("EXECUTING wordend.\n"); + if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) + && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) + break; + goto fail; + +#ifdef emacs +#ifdef emacs19 + case before_dot: + DEBUG_PRINT1 ("EXECUTING before_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) >= point) + goto fail; + break; + + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) != point) + goto fail; + break; + + case after_dot: + 
DEBUG_PRINT1 ("EXECUTING after_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) <= point) + goto fail; + break; +#else /* not emacs19 */ + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) + goto fail; + break; +#endif /* not emacs19 */ + + case syntaxspec: + DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchsyntax; + + case wordchar: + DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); + mcnt = (int) Sword; + matchsyntax: + PREFETCH (); + if (SYNTAX (*d++) != (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED (); + break; + + case notsyntaxspec: + DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchnotsyntax; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); + mcnt = (int) Sword; + matchnotsyntax: + PREFETCH (); + if (SYNTAX (*d++) == (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED (); + break; + +#else /* not emacs */ + case wordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); + PREFETCH (); + if (!WORDCHAR_P (d)) + goto fail; + SET_REGS_MATCHED (); + d++; + break; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); + PREFETCH (); + if (WORDCHAR_P (d)) + goto fail; + SET_REGS_MATCHED (); + d++; + break; +#endif /* not emacs */ + + default: + abort (); + } + continue; /* Successfully executed one pattern command; keep going. */ + + + /* We goto here if a matching operation fails. */ + fail: + if (!FAIL_STACK_EMPTY ()) + { /* A restart point is known. Restore to that state. */ + DEBUG_PRINT1 ("\nFAIL:\n"); + POP_FAILURE_POINT (d, p, + lowest_active_reg, highest_active_reg, + regstart, regend, reg_info); + + /* If this failure point is a dummy, try the next one. */ + if (!p) + goto fail; + + /* If we failed to the end of the pattern, don't examine *p. 
*/ + assert (p <= pend); + if (p < pend) + { + boolean is_a_jump_n = false; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((re_opcode_t) *p) + { + case jump_n: + is_a_jump_n = true; + case maybe_pop_jump: + case pop_failure_jump: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) + || (!is_a_jump_n + && (re_opcode_t) *p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + + if (d >= string1 && d <= end1) + dend = end_match_1; + } + else + break; /* Matching at this starting point really fails. */ + } /* for (;;) */ + + if (best_regs_set) + goto restore_best_regs; + + FREE_VARIABLES (); + + return -1; /* Failure to match. */ +} /* re_match_2 */ + +/* Subroutine definitions for re_match_2. */ + + +/* We are passed P pointing to a register number after a start_memory. + + Return true if the pattern up to the corresponding stop_memory can + match the empty string, and false otherwise. + + If we find the matching stop_memory, sets P to point to one past its number. + Otherwise, sets P to an undefined byte less than or equal to END. + + We don't handle duplicates properly (yet). */ + +static boolean +group_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; +{ + int mcnt; + /* Point to after the args to the start_memory. */ + unsigned char *p1 = *p + 2; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and return true or + false, as appropriate, when we get to one that can't, or to the + matching stop_memory. */ + + switch ((re_opcode_t) *p1) + { + /* Could be either a loop or a series of alternatives. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + /* If the next operation is not a jump backwards in the + pattern. 
*/ + + if (mcnt >= 0) + { + /* Go through the on_failure_jumps of the alternatives, + seeing if any of the alternatives cannot match nothing. + The last alternative starts with only a jump, + whereas the rest start with on_failure_jump and end + with a jump, e.g., here is the pattern for `a|b|c': + + /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 + /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 + /exactn/1/c + + So, we have to first go through the first (n-1) + alternatives and then deal with the last one separately. */ + + + /* Deal with the first (n-1) alternatives, which start + with an on_failure_jump (see above) that jumps to right + past a jump_past_alt. */ + + while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) + { + /* `mcnt' holds how many bytes long the alternative + is, including the ending `jump_past_alt' and + its number. */ + + if (!alt_match_null_string_p (p1, p1 + mcnt - 3, + reg_info)) + return false; + + /* Move to right after this alternative, including the + jump_past_alt. */ + p1 += mcnt; + + /* Break if it's the beginning of an n-th alternative + that doesn't begin with an on_failure_jump. */ + if ((re_opcode_t) *p1 != on_failure_jump) + break; + + /* Still have to check that it's not an n-th + alternative that starts with an on_failure_jump. */ + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) + { + /* Get to the beginning of the n-th alternative. */ + p1 -= 3; + break; + } + } + + /* Deal with the last alternative: go back and get number + of the `jump_past_alt' just before it. `mcnt' contains + the length of the alternative. */ + EXTRACT_NUMBER (mcnt, p1 - 2); + + if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) + return false; + + p1 += mcnt; /* Get past the n-th alternative. 
*/ + } /* if mcnt > 0 */ + break; + + + case stop_memory: + assert (p1[1] == **p); + *p = p1 + 2; + return true; + + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + return false; +} /* group_match_null_string_p */ + + +/* Similar to group_match_null_string_p, but doesn't deal with alternatives: + It expects P to be the first byte of a single alternative and END one + byte past the last. The alternative can contain groups. */ + +static boolean +alt_match_null_string_p (p, end, reg_info) + unsigned char *p, *end; + register_info_type *reg_info; +{ + int mcnt; + unsigned char *p1 = p; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and break when we get + to one that can't. */ + + switch ((re_opcode_t) *p1) + { + /* It's a loop. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + break; + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + return true; +} /* alt_match_null_string_p */ + + +/* Deals with the ops common to group_match_null_string_p and + alt_match_null_string_p. + + Sets P to one after the op and its arguments, if any. */ + +static boolean +common_op_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; +{ + int mcnt; + boolean ret; + int reg_no; + unsigned char *p1 = *p; + + switch ((re_opcode_t) *p1++) + { + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbeg: + case wordend: + case wordbound: + case notwordbound: +#ifdef emacs + case before_dot: + case at_dot: + case after_dot: +#endif + break; + + case start_memory: + reg_no = *p1; + assert (reg_no > 0 && reg_no <= MAX_REGNUM); + ret = group_match_null_string_p (&p1, end, reg_info); + + /* Have to set this here in case we're checking a group which + contains a group and a back reference to it. 
*/ + + if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; + + if (!ret) + return false; + break; + + /* If this is an optimized succeed_n for zero times, make the jump. */ + case jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (mcnt >= 0) + p1 += mcnt; + else + return false; + break; + + case succeed_n: + /* Get to the number of times to succeed. */ + p1 += 2; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + if (mcnt == 0) + { + p1 -= 4; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + } + else + return false; + break; + + case duplicate: + if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) + return false; + break; + + case set_number_at: + p1 += 4; + + default: + /* All other opcodes mean we cannot match the empty string. */ + return false; + } + + *p = p1; + return true; +} /* common_op_match_null_string_p */ + + +/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN + bytes; nonzero otherwise. */ + +static int +bcmp_translate (s1, s2, len, translate) + unsigned char *s1, *s2; + register int len; + char *translate; +{ + register unsigned char *p1 = s1, *p2 = s2; + while (len) + { + if (translate[*p1++] != translate[*p2++]) return 1; + len--; + } + return 0; +} + +/* Entry points for GNU code. */ + +/* re_compile_pattern is the GNU regular expression compiler: it + compiles PATTERN (of length SIZE) and puts the result in BUFP. + Returns 0 if the pattern was valid, otherwise an error string. + + Assumes the `allocated' (and perhaps `buffer') and `translate' fields + are set in BUFP on entry. + + We call regex_compile to do the actual compilation. */ + +const char * +re_compile_pattern (pattern, length, bufp) + const char *pattern; + int length; + struct re_pattern_buffer *bufp; +{ + reg_errcode_t ret; + + /* GNU code is written to assume at least RE_NREGS registers will be set + (and at least one extra will be -1). 
*/ + bufp->regs_allocated = REGS_UNALLOCATED; + + /* And GNU code determines whether or not to get register information + by passing null for the REGS argument to re_match, etc., not by + setting no_sub. */ + bufp->no_sub = 0; + + /* Match anchors at newline. */ + bufp->newline_anchor = 1; + + ret = regex_compile (pattern, length, re_syntax_options, bufp); + + return re_error_msg[(int) ret]; +} + +/* Entry points compatible with 4.2 BSD regex library. We don't define + them if this is an Emacs or POSIX compilation. */ + +#if !defined (emacs) && !defined (_POSIX_SOURCE) + +/* BSD has one and only one pattern buffer. */ +static struct re_pattern_buffer re_comp_buf; + +char * +re_comp (s) + const char *s; +{ + reg_errcode_t ret; + + if (!s) + { + if (!re_comp_buf.buffer) + return "No previous regular expression"; + return 0; + } + + if (!re_comp_buf.buffer) + { + re_comp_buf.buffer = (unsigned char *) malloc (200); + if (re_comp_buf.buffer == NULL) + return "Memory exhausted"; + re_comp_buf.allocated = 200; + + re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); + if (re_comp_buf.fastmap == NULL) + return "Memory exhausted"; + } + + /* Since `re_exec' always passes NULL for the `regs' argument, we + don't need to initialize the pattern buffer fields which affect it. */ + + /* Match anchors at newlines. */ + re_comp_buf.newline_anchor = 1; + + ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); + + /* Yes, we're discarding `const' here. */ + return (char *) re_error_msg[(int) ret]; +} + + +int +re_exec (s) + const char *s; +{ + const int len = strlen (s); + return + 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); +} +#endif /* not emacs and not _POSIX_SOURCE */ + +/* POSIX.2 functions. Don't define these for Emacs. */ + +#ifndef emacs + +/* regcomp takes a regular expression as a string and compiles it. + + PREG is a regex_t *. We do not expect any fields to be initialized, + since POSIX says we shouldn't. 
Thus, we set + + `buffer' to the compiled pattern; + `used' to the length of the compiled pattern; + `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + REG_EXTENDED bit in CFLAGS is set; otherwise, to + RE_SYNTAX_POSIX_BASIC; + `newline_anchor' to REG_NEWLINE being set in CFLAGS; + `fastmap' and `fastmap_accurate' to zero; + `re_nsub' to the number of subexpressions in PATTERN. + + PATTERN is the address of the pattern string. + + CFLAGS is a series of bits which affect compilation. + + If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we + use POSIX basic syntax. + + If REG_NEWLINE is set, then . and [^...] don't match newline. + Also, regexec will try a match beginning after every newline. + + If REG_ICASE is set, then we considers upper- and lowercase + versions of letters to be equivalent when matching. + + If REG_NOSUB is set, then when PREG is passed to regexec, that + routine will report only success or failure, and nothing about the + registers. + + It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for + the return codes and their meanings.) */ + +int +regcomp (preg, pattern, cflags) + regex_t *preg; + const char *pattern; + int cflags; +{ + reg_errcode_t ret; + unsigned syntax + = (cflags & REG_EXTENDED) ? + RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; + + /* regex_compile will allocate the space for the compiled pattern. */ + preg->buffer = 0; + preg->allocated = 0; + + /* Don't bother to use a fastmap when searching. This simplifies the + REG_NEWLINE case: if we used a fastmap, we'd have to put all the + characters after newlines into the fastmap. This way, we just try + every character. */ + preg->fastmap = 0; + + if (cflags & REG_ICASE) + { + unsigned i; + + preg->translate = (char *) malloc (CHAR_SET_SIZE); + if (preg->translate == NULL) + return (int) REG_ESPACE; + + /* Map uppercase characters to corresponding lowercase ones. */ + for (i = 0; i < CHAR_SET_SIZE; i++) + preg->translate[i] = ISUPPER (i) ? 
tolower (i) : i; + } + else + preg->translate = NULL; + + /* If REG_NEWLINE is set, newlines are treated differently. */ + if (cflags & REG_NEWLINE) + { /* REG_NEWLINE implies neither . nor [^...] match newline. */ + syntax &= ~RE_DOT_NEWLINE; + syntax |= RE_HAT_LISTS_NOT_NEWLINE; + /* It also changes the matching behavior. */ + preg->newline_anchor = 1; + } + else + preg->newline_anchor = 0; + + preg->no_sub = !!(cflags & REG_NOSUB); + + /* POSIX says a null character in the pattern terminates it, so we + can use strlen here in compiling the pattern. */ + ret = regex_compile (pattern, strlen (pattern), syntax, preg); + + /* POSIX doesn't distinguish between an unmatched open-group and an + unmatched close-group: both are REG_EPAREN. */ + if (ret == REG_ERPAREN) ret = REG_EPAREN; + + return (int) ret; +} + + +/* regexec searches for a given pattern, specified by PREG, in the + string STRING. + + If NMATCH is zero or REG_NOSUB was set in the cflags argument to + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + least NMATCH elements, and we set them to the offsets of the + corresponding matched substrings. + + EFLAGS specifies `execution flags' which affect matching: if + REG_NOTBOL is set, then ^ does not match at the beginning of the + string; if REG_NOTEOL is set, then $ does not match at the end. + + We return 0 if we find a match and REG_NOMATCH if not. */ + +int +regexec (preg, string, nmatch, pmatch, eflags) + const regex_t *preg; + const char *string; + size_t nmatch; + regmatch_t pmatch[]; + int eflags; +{ + int ret; + struct re_registers regs; + regex_t private_preg; + int len = strlen (string); + boolean want_reg_info = !preg->no_sub && nmatch > 0; + + private_preg = *preg; + + private_preg.not_bol = !!(eflags & REG_NOTBOL); + private_preg.not_eol = !!(eflags & REG_NOTEOL); + + /* The user has told us exactly how many registers to return + information about, via `nmatch'. We have to pass that on to the + matching routines. 
*/ + private_preg.regs_allocated = REGS_FIXED; + + if (want_reg_info) + { + regs.num_regs = nmatch; + regs.start = TALLOC (nmatch, regoff_t); + regs.end = TALLOC (nmatch, regoff_t); + if (regs.start == NULL || regs.end == NULL) + return (int) REG_NOMATCH; + } + + /* Perform the searching operation. */ + ret = re_search (&private_preg, string, len, + /* start: */ 0, /* range: */ len, + want_reg_info ? ®s : (struct re_registers *) 0); + + /* Copy the register information to the POSIX structure. */ + if (want_reg_info) + { + if (ret >= 0) + { + unsigned r; + + for (r = 0; r < nmatch; r++) + { + pmatch[r].rm_so = regs.start[r]; + pmatch[r].rm_eo = regs.end[r]; + } + } + + /* If we needed the temporary register info, free the space now. */ + free (regs.start); + free (regs.end); + } + + /* We want zero return to mean success, unlike `re_search'. */ + return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; +} + + +/* Returns a message corresponding to an error code, ERRCODE, returned + from either regcomp or regexec. We don't use PREG here. */ + +size_t +regerror (errcode_v, preg, errbuf, errbuf_size) + int errcode_v; + const regex_t *preg; + char *errbuf; + size_t errbuf_size; +{ + const char *msg; + size_t msg_size; + + if (errcode_v < 0 + || errcode_v >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) + /* Only error codes returned by the rest of the code should be passed + to this routine. If we are given anything else, or if other regex + code generates an invalid error code, then the program has a bug. + Dump core so we can fix it. */ + abort (); + + msg = re_error_msg[errcode_v]; + + /* POSIX doesn't require that we do anything in this case, but why + not be nice. */ + if (! msg) + msg = "Success"; + + msg_size = strlen (msg) + 1; /* Includes the null. 
*/ + + if (errbuf_size != 0) + { + if (msg_size > errbuf_size) + { + strncpy (errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = 0; + } + else + strcpy (errbuf, msg); + } + + return msg_size; +} + + +/* Free dynamically allocated space used by PREG. */ + +void +regfree (preg) + regex_t *preg; +{ + if (preg->buffer != NULL) + free (preg->buffer); + preg->buffer = NULL; + + preg->allocated = 0; + preg->used = 0; + + if (preg->fastmap != NULL) + free (preg->fastmap); + preg->fastmap = NULL; + preg->fastmap_accurate = 0; + + if (preg->translate != NULL) + free (preg->translate); + preg->translate = NULL; +} + +#endif /* not emacs */ + +/* +Local variables: +make-backup-files: t +version-control: t +trim-versions-without-asking: nil +End: +*/ Index: branches/apertium-tagger/apertium2/apertium/win32/regex.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/regex.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/regex.h (revision 69632) @@ -0,0 +1,498 @@ +/* Definitions for data structures and routines for the regular + expression library, version 0.12. + + Copyright (C) 1985, 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +#ifndef __REGEXP_LIBRARY_H__ +#define __REGEXP_LIBRARY_H__ + +#ifdef __cplusplus + extern "C" { +#endif + +/* POSIX says that must be included (by the caller) before + . */ + +#ifdef VMS +/* VMS doesn't have `size_t' in , even though POSIX says it + should be there. */ +#include +#endif + + +/* The following bits are used to determine the regexp syntax we + recognize. The set/not-set meanings are chosen so that Emacs syntax + remains the value 0. The bits are given in alphabetical order, and + the definitions shifted by one from the previous bit; thus, when we + add or remove a bit, only one other definition need change. */ +typedef unsigned reg_syntax_t; + +/* If this bit is not set, then \ inside a bracket expression is literal. + If set, then such a \ quotes the following character. */ +#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) + +/* If this bit is not set, then + and ? are operators, and \+ and \? are + literals. + If set, then \+ and \? are operators and + and ? are literals. */ +#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) + +/* If this bit is set, then character classes are supported. They are: + [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], + [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. + If not set, then character classes are not supported. */ +#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) + +/* If this bit is set, then ^ and $ are always anchors (outside bracket + expressions, of course). + If this bit is not set, then it depends: + ^ is an anchor if it is at the beginning of a regular + expression or after an open-group or an alternation operator; + $ is an anchor if it is at the end of a regular expression, or + before a close-group or an alternation operator. + + This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because + POSIX draft 11.2 says that * etc. in leading positions is undefined. 
+ We already implemented a previous draft which made those constructs + invalid, though, so we haven't changed the code back. */ +#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) + +/* If this bit is set, then special characters are always special + regardless of where they are in the pattern. + If this bit is not set, then special characters are special only in + some contexts; otherwise they are ordinary. Specifically, + * + ? and intervals are only special when not after the beginning, + open-group, or alternation operator. */ +#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) + +/* If this bit is set, then *, +, ?, and { cannot be first in an re or + immediately after an alternation or begin-group operator. */ +#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) + +/* If this bit is set, then . matches newline. + If not set, then it doesn't. */ +#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) + +/* If this bit is set, then . doesn't match NUL. + If not set, then it does. */ +#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) + +/* If this bit is set, nonmatching lists [^...] do not match newline. + If not set, they do. */ +#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) + +/* If this bit is set, either \{...\} or {...} defines an + interval, depending on RE_NO_BK_BRACES. + If not set, \{, \}, {, and } are literals. */ +#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) + +/* If this bit is set, +, ? and | aren't recognized as operators. + If not set, they are. */ +#define RE_LIMITED_OPS (RE_INTERVALS << 1) + +/* If this bit is set, newline is an alternation operator. + If not set, newline is literal. */ +#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) + +/* If this bit is set, then `{...}' defines an interval, and \{ and \} + are literals. + If not set, then `\{...\}' defines an interval. */ +#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) + +/* If this bit is set, (...) defines a group, and \( and \) are literals. 
+ If not set, \(...\) defines a group, and ( and ) are literals. */ +#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) + +/* If this bit is set, then \ matches . + If not set, then \ is a back-reference. */ +#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) + +/* If this bit is set, then | is an alternation operator, and \| is literal. + If not set, then \| is an alternation operator, and | is literal. */ +#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) + +/* If this bit is set, then an ending range point collating higher + than the starting range point, as in [z-a], is invalid. + If not set, then when ending range point collates higher than the + starting range point, the range is ignored. */ +#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) + +/* If this bit is set, then an unmatched ) is ordinary. + If not set, then an unmatched ) is invalid. */ +#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) + +/* This global variable defines the particular regexp syntax to use (for + some interfaces). When a regexp is compiled, the syntax used is + stored in the pattern buffer, so changing this does not affect + already-compiled regexps. */ +extern reg_syntax_t re_syntax_options; + +/* Define combinations of the above bits for the standard possibilities. + (The [[[ comments delimit what gets put into the Texinfo file, so + don't delete them!) 
*/ +/* [[[begin syntaxes]]] */ +#define RE_SYNTAX_EMACS 0 + +#define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +#define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + +#define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + +#define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + +#define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + +/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ +#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + +#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + +/* Syntax bits common to both basic and extended POSIX regex syntax. */ +#define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + +#define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + +/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ +#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + +#define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. 
*/ +#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) +/* [[[end syntaxes]]] */ + +/* Maximum number of duplicates an interval can allow. Some systems + (erroneously) define this in other header files, but we want our + value, so remove any previous define. */ +#ifdef RE_DUP_MAX +#undef RE_DUP_MAX +#endif +#define RE_DUP_MAX ((1 << 15) - 1) + + +/* POSIX `cflags' bits (i.e., information for `regcomp'). */ + +/* If this bit is set, then use extended regular expression syntax. + If not set, then use basic regular expression syntax. */ +#define REG_EXTENDED 1 + +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define REG_ICASE (REG_EXTENDED << 1) + +/* If this bit is set, then anchors do not match at newline + characters in the string. + If not set, then anchors do match at newlines. */ +#define REG_NEWLINE (REG_ICASE << 1) + +/* If this bit is set, then report only success or fail in regexec. + If not set, then returns differ between not matching and errors. */ +#define REG_NOSUB (REG_NEWLINE << 1) + + +/* POSIX `eflags' bits (i.e., information for regexec). */ + +/* If this bit is set, then the beginning-of-line operator doesn't match + the beginning of the string (presumably because it's not the + beginning of a line). + If not set, then the beginning-of-line operator does match the + beginning of the string. */ +#define REG_NOTBOL 1 + +/* Like REG_NOTBOL, except for the end-of-line. */ +#define REG_NOTEOL (1 << 1) + + +/* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. */ +typedef enum +{ + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + + /* POSIX regcomp return error codes. (In the order listed in the + standard.) 
*/ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Not implemented. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched \{. */ + REG_BADBR, /* Invalid contents of \{\}. */ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ +} reg_errcode_t; + +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + `translate', and `no_sub' can be set. After the pattern has been + compiled, the `re_nsub' field is available. All other fields are + private to the regex routines. */ + +struct re_pattern_buffer +{ +/* [[[begin pattern_buffer]]] */ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. */ + char *translate; + + /* Number of subexpressions found by the compiler. 
*/ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#define REGS_UNALLOCATED 0 +#define REGS_REALLOCATE 1 +#define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + +/* [[[end pattern_buffer]]] */ +}; + +typedef struct re_pattern_buffer regex_t; + + +/* search.c (search_buffer) in Emacs needs this one opcode value. It is + defined both in `regex.c' and here. */ +#define RE_EXACTN_VALUE 1 + +/* Type for byte offsets within the string. POSIX mandates this. */ +typedef int regoff_t; + + +/* This is the structure we store register match data in. See + regex.texinfo for a full description of what registers match. */ +struct re_registers +{ + unsigned num_regs; + regoff_t *start; + regoff_t *end; +}; + + +/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. */ +#ifndef RE_NREGS +#define RE_NREGS 30 +#endif + + +/* POSIX specification for registers. 
Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ +typedef struct +{ + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ +} regmatch_t; + +/* Declarations for routines. */ + +/* To avoid duplicating every routine declaration -- once with a + prototype (if we are ANSI), and once without (if we aren't) -- we + use the following macro to declare argument types. This + unfortunately clutters up the declarations a bit, but I think it's + worth it. */ + +#if __STDC__ + +#define _RE_ARGS(args) args + +#else /* not __STDC__ */ + +#define _RE_ARGS(args) () + +#endif /* not __STDC__ */ + +/* Sets the current default syntax to SYNTAX, and return the old syntax. + You can also simply assign to the `re_syntax_options' variable. */ +extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); + +/* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. */ +extern const char *re_compile_pattern + _RE_ARGS ((const char *pattern, int length, + struct re_pattern_buffer *buffer)); + + +/* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if was an + internal error. */ +extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); + + +/* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. Also return register + information in REGS (if REGS and BUFFER->no_sub are nonzero). 
*/ +extern int re_search + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, struct re_registers *regs)); + + +/* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. */ +extern int re_search_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop)); + + +/* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ +extern int re_match + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs)); + + +/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ +extern int re_match_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop)); + + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ +extern void re_set_registers + _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends)); + +/* 4.2 bsd compatibility. */ +extern char *re_comp _RE_ARGS ((const char *)); +extern int re_exec _RE_ARGS ((const char *)); + +/* POSIX compatibility. 
/* Strip the last component off a pathname, in place.
   Thus, parent("a\b\c") -> "a\b"

   Only the backslash is treated as a separator.  A path that contains no
   backslash has no parent to cut back to and is left unchanged (the
   previous code dereferenced the NULL returned by strrchr in that case).
   Returns parent_buf. */
char* parent(char* parent_buf) {
  char* pos = strrchr(parent_buf, '\\');

  if (pos != NULL) {
    pos[0] = '\0';
  }

  return parent_buf;
}

/* Return nonzero when suffix is exactly ".exe", ignoring ASCII case. */
static int is_exe_suffix(const char* suffix) {
  const char* expected = ".exe";

  while (*expected != '\0') {
    char c = *suffix;

    if (c >= 'A' && c <= 'Z') {
      c = (char) (c - 'A' + 'a');
    }
    /* Also catches a suffix that ends early: '\0' never equals expected. */
    if (c != *expected) {
      return 0;
    }
    ++suffix;
    ++expected;
  }

  return *suffix == '\0';
}

/* Remove the .exe if the user invoked this executable with its extension.
   That is, if the user typed something like apertium.exe instead of apertium.

   The comparison ignores ASCII case (".EXE", ".Exe", ...), since Windows
   file names are case-insensitive.  Anything after the last '.' that is
   not "exe" leaves buf untouched.  Works in place and returns buf. */
char* remove_extension(char* buf) {
  char* pos = strrchr(buf, '.');

  if (pos != NULL && is_exe_suffix(pos)) {
    pos[0] = '\0';
  }

  return buf;
}
x : y) + +int main(int argc, char* argv[]) { + char *args[ARG_BUF_SIZE]; + char base_path[PATH_BUF_SIZE + 1]; + char script_path[PATH_BUF_SIZE + 1]; + char shell_path[PATH_BUF_SIZE + 1]; + char env_path[ENV_VAR_SIZE]; + int argi; + + _fullpath(shell_path, argv[0], PATH_BUF_SIZE); + strcpy(script_path, shell_path); + strcpy(base_path, shell_path); + + parent(shell_path); + strcat(shell_path, "\\sh.exe"); + + remove_extension(script_path); + parent(base_path); + + args[0] = shell_path; + args[1] = "--norc"; + args[2] = script_path; + + /* Any parameters passed on the command line will be passed through to the shell script */ + for (argi = 0; argi < MIN(argc - 1, ARG_BUF_SIZE); argi++) { + printf("%s\n", argv[argi + 1]); + args[argi + NUM_EXEC_ARGS] = argv[argi + 1]; + } + /* Signal the end of the argument list */ + args[argi + NUM_EXEC_ARGS] = NULL; + + /* Add this executable's directory to the path */ + strcpy(env_path, "PATH="); + strcat(env_path, getenv("PATH")); + strcat(env_path, ";"); + strcat(env_path, base_path); + _putenv(env_path); + + _spawnv(_P_WAIT, args[0], &args[1]); + + _flushall(); +} Index: branches/apertium-tagger/apertium2/apertium/serialiser.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/serialiser.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/serialiser.h (revision 69632) @@ -0,0 +1,288 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
// Returns the number of bytes needed to hold SerialisedType_ once its
// leading zero bytes are dropped: 0 for the value 0, 1 for 1..255, 2 for
// 256..65535, and so on.
//
// The result never exceeds sizeof(SerialisedType). Stopping there keeps the
// shift count below the width of the operand (shifting by the full width is
// undefined behaviour, which the previous loop hit whenever the most
// significant byte was non-zero) and guarantees termination for negative
// values of signed types, which report sizeof(SerialisedType).
template <typename SerialisedType>
static unsigned char compressedSize(const SerialisedType &SerialisedType_) {
  const unsigned char MaximumSize = sizeof(SerialisedType);
  unsigned char compressedSize_ = 0;

  while (compressedSize_ != MaximumSize &&
         (SerialisedType_ >>
          std::numeric_limits<unsigned char>::digits * compressedSize_) != 0) {
    ++compressedSize_;
  }

  return compressedSize_;
}
&SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const std::size_t &SerialisedType_, + std::ostream &Output); +}; + +template class Serialiser > { +public: + inline static void serialise(const std::vector &SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const wchar_t &SerialisedType_, + std::ostream &Output); +}; +} + +template +inline void serialise(const SerialisedType &SerialisedType_, + std::ostream &Output) { + Serialiser::serialise(SerialisedType_, Output); +} + +void Serialiser::serialise(const a &SerialisedType_, std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheTags, Output); + ::Apertium::serialise(SerialisedType_.TheMorphemes, Output); +} + +void Serialiser::serialise(const Analysis &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheMorphemes, Output); +} + +void Serialiser::serialise(const i &SerialisedType_, std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheTags, Output); +} + +void Serialiser::serialise(const Lemma &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheLemma, Output); +} + +void Serialiser::serialise(const Morpheme &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheLemma, Output); + ::Apertium::serialise(SerialisedType_.TheTags, Output); +} + +void Serialiser::serialise(const Tag &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheTag, Output); +} + +template +void Serialiser >::serialise( + const std::basic_string &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.size(), Output); + + for (typename std::basic_string::const_iterator + SerialisedType_iterator = SerialisedType_.begin(); + // Call .end() each iteration to save memory. 
+ SerialisedType_iterator != SerialisedType_.end(); + ++SerialisedType_iterator) { + ::Apertium::serialise(*SerialisedType_iterator, Output); + } +} + +template +void Serialiser >::serialise( + const std::map &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.size(), Output); + + for (typename std::map::const_iterator + SerialisedType_iterator = SerialisedType_.begin(); + // Call .end() each iteration to save memory. + SerialisedType_iterator != SerialisedType_.end(); + ++SerialisedType_iterator) { + ::Apertium::serialise(*SerialisedType_iterator, Output); + } +} + +template +void Serialiser >::serialise( + const std::pair &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.first, Output); + ::Apertium::serialise(SerialisedType_.second, Output); +} + +void Serialiser::serialise(const std::size_t &SerialisedType_, + std::ostream &Output) { + try { + Output.put(compressedSize(SerialisedType_)); + + if (!Output) { + std::stringstream what_; + what_ << "can't serialise size " << std::hex + << /* [1] */ +compressedSize(SerialisedType_) << std::dec; + throw Exception::Serialiser::not_Stream_good(what_); + } + + for (unsigned char CompressedSize = compressedSize(SerialisedType_); + CompressedSize != 0; Output.put(static_cast( + SerialisedType_ >> + std::numeric_limits::digits * --CompressedSize))) { + if (!Output) { + std::stringstream what_; + what_ << "can't serialise byte " << std::hex + << /* [1] */ +static_cast( + SerialisedType_ >> + std::numeric_limits::digits * + CompressedSize) << std::dec; + throw Exception::Serialiser::not_Stream_good(what_); + } + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't serialise const std::size_t & : " + << basic_ExceptionType_.what(); + throw Exception::Serialiser::size_t_(what_); + } +} + +template +void Serialiser >::serialise( + const std::vector &SerialisedType_, std::ostream &Output) { + 
::Apertium::serialise(SerialisedType_.size(), Output); + + for (typename std::vector::const_iterator value_type_ = + SerialisedType_.begin(); + // Call .end() each iteration to save memory. + value_type_ != SerialisedType_.end(); ++value_type_) { + ::Apertium::serialise(*value_type_, Output); + } +} + +void Serialiser::serialise(const wchar_t &SerialisedType_, + std::ostream &Output) { + try { + Output.put(compressedSize(SerialisedType_)); + + if (!Output) { + std::stringstream what_; + what_ << "can't serialise size " << std::hex + << /* [1] */ +compressedSize(SerialisedType_); + throw Exception::Serialiser::not_Stream_good(what_); + } + + for (unsigned char CompressedSize = compressedSize(SerialisedType_); + CompressedSize != 0; Output.put(static_cast( + static_cast(SerialisedType_) >> + std::numeric_limits::digits * --CompressedSize))) { + if (!Output) { + std::stringstream what_; + what_ << "can't serialise byte " << std::hex + << /* [1] */ +(static_cast(SerialisedType_) >> + std::numeric_limits::digits * + CompressedSize); + throw Exception::Serialiser::not_Stream_good(what_); + } + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't serialise const wchar_t & : " + << basic_ExceptionType_.what(); + throw Exception::Serialiser::wchar_t_(what_); + } +} +} + +// [1] operator+ promotes its operand to a printable integral type. 
+ +#endif // SERIALISER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.cc (revision 69632) @@ -0,0 +1,50 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "stream_5_3_1_tagger_trainer.h" + +#include "analysis.h" +#include "basic_tagger.h" +#include "serialiser.h" + +#include +#include +#include +#include + +namespace Apertium { +Stream_5_3_1_TaggerTrainer::Stream_5_3_1_TaggerTrainer( + const basic_Tagger::Flags &Flags_) + : basic_5_3_1_Tagger(), basic_StreamTaggerTrainer(Flags_) {} + +void Stream_5_3_1_TaggerTrainer::serialise( + std::ostream &Serialised_basic_Tagger) const { + ::Apertium::serialise(Model, Serialised_basic_Tagger); +} + +void +Stream_5_3_1_TaggerTrainer::train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) { + Model.insert(std::make_pair(Analysis_, 0)).first->second += Coefficient_; +} + +void Stream_5_3_1_TaggerTrainer::multiplyModel( + const std::size_t &OccurrenceCoefficientMultiplier) { + for (std::map::iterator Analysis_ = Model.begin(); + Analysis_ != Model.end(); ++Analysis_) { + Analysis_->second *= OccurrenceCoefficientMultiplier; + } +} +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.cc (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "stream_5_3_2_tagger_trainer.h" + +#include "a.h" +#include "analysis.h" +#include "lemma.h" +#include "serialiser.h" + +#include +#include +#include + +namespace Apertium { +Stream_5_3_2_TaggerTrainer::Stream_5_3_2_TaggerTrainer(const Flags &Flags_) + : basic_StreamTaggerTrainer(Flags_) {} + +void Stream_5_3_2_TaggerTrainer::serialise( + std::ostream &Serialised_basic_Tagger) const { + ::Apertium::serialise(Model, Serialised_basic_Tagger); +} + +void +Stream_5_3_2_TaggerTrainer::train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) { + Model.insert(std::make_pair(static_cast(Analysis_), + std::map())) + .first->second.insert(std::make_pair(static_cast(Analysis_), 0)) + .first->second += Coefficient_; +} + +void Stream_5_3_2_TaggerTrainer::multiplyModel( + const std::size_t &OccurrenceCoefficientMultiplier) { + for (std::map >::iterator a_ = Model.begin(); + a_ != Model.end(); ++a_) { + for (std::map::iterator r_ = a_->second.begin(); + r_ != a_->second.end(); ++r_) { + r_->second *= OccurrenceCoefficientMultiplier; + } + } +} +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.cc (revision 69632) @@ -0,0 +1,88 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "analysis.h" +#include "i.h" +#include "lemma.h" +#include "serialiser.h" +#include "stream_5_3_3_tagger_trainer.h" + +#include +#include +#include +#include +#include + +namespace Apertium { +Stream_5_3_3_TaggerTrainer::Stream_5_3_3_TaggerTrainer(const Flags &Flags_) + : basic_StreamTaggerTrainer(Flags_) {} + +void Stream_5_3_3_TaggerTrainer::serialise( + std::ostream &Serialised_basic_Tagger) const { + ::Apertium::serialise(Model, Serialised_basic_Tagger); +} + +void +Stream_5_3_3_TaggerTrainer::train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) { + Model.first.insert( + std::make_pair(i(Analysis_), std::map())) + .first->second.insert(std::make_pair(Lemma(Analysis_), 0)) + .first->second += Coefficient_; + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + Model.second.first.insert(std::make_pair(i(*(Morpheme_ - 1)), + std::map())) + .first->second.insert(std::make_pair(Lemma(*Morpheme_), 0)) + .first->second += Coefficient_; + Model.second.second.insert(std::make_pair(Lemma(*Morpheme_), + std::map())) + .first->second.insert(std::make_pair(i(*Morpheme_), 0)) + .first->second += Coefficient_; + } +} + +void Stream_5_3_3_TaggerTrainer::multiplyModel( + const std::size_t &OccurrenceCoefficientMultiplier) { + for (std::map >::iterator i_ = + Model.first.begin(); + i_ != Model.first.end(); ++i_) { + for (std::map::iterator Lemma_ = i_->second.begin(); + Lemma_ != i_->second.end(); ++Lemma_) { + Lemma_->second *= OccurrenceCoefficientMultiplier; + } + } + + for (std::map 
>::iterator i_ = + Model.second.first.begin(); + i_ != Model.second.first.end(); ++i_) { + for (std::map::iterator Lemma_ = i_->second.begin(); + Lemma_ != i_->second.end(); ++Lemma_) { + Lemma_->second *= OccurrenceCoefficientMultiplier; + } + } + + for (std::map >::iterator Lemma_ = + Model.second.second.begin(); + Lemma_ != Model.second.second.end(); ++Lemma_) { + for (std::map::iterator i_ = Lemma_->second.begin(); + i_ != Lemma_->second.end(); ++i_) { + i_->second *= OccurrenceCoefficientMultiplier; + } + } +} +} Index: branches/apertium-tagger/apertium2/apertium/Makefile.am =================================================================== --- branches/apertium-tagger/apertium2/apertium/Makefile.am (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/Makefile.am (revision 69632) @@ -0,0 +1,651 @@ +AUTOMAKE_OPTIONS = subdir-objects + +h_sources = a.h \ + align.h \ + analysis.h \ + apertium_re.h \ + apertium_tagger.h \ + basic_5_3_1_tagger.h \ + basic_5_3_2_tagger.h \ + basic_5_3_3_tagger.h \ + basic_exception_type.h \ + basic_stream_tagger.h \ + basic_stream_tagger_trainer.h \ + basic_tagger.h \ + collection.h \ + constant_manager.h \ + constructor_eq_delete.h \ + deserialiser.h \ + endian_double_util.h \ + err_exception.h \ + exception.h \ + exception_type.h \ + file_tagger.h \ + hmm.h \ + i.h \ + interchunk.h \ + interchunk_word.h \ + latex_accentsmap.h \ + lemma.h \ + lexical_unit.h \ + linebreak.h \ + lswpost.h \ + morpheme.h \ + morpho_stream.h \ + optional.h \ + postchunk.h \ + serialiser.h \ + stream.h \ + stream_5_3_1_tagger.h \ + stream_5_3_2_tagger.h \ + stream_5_3_3_tagger.h \ + stream_5_3_1_tagger_trainer.h \ + stream_5_3_2_tagger_trainer.h \ + stream_5_3_3_tagger_trainer.h \ + streamed_type.h \ + string_utils.h \ + tag.h \ + tagger_data.h \ + tagger_data_hmm.h \ + tagger_data_lsw.h \ + tagger_utils.h \ + tagger_word.h \ + tmx_aligner_tool.h \ + tmx_alignment.h \ + tmx_align_parameters.h \ + tmx_arguments_parser.h \ + tmx_book_to_matrix.h 
\ + tmx_builder.h \ + tmx_dictionary.h \ + tmx_dic_tree.h \ + tmx_quasi_diagonal.h \ + tmx_serialize_impl.h \ + tmx_strings_and_streams.h \ + tmx_trail_postprocessors.h \ + tmx_translate.h \ + tmx_words.h \ + transfer_data.h \ + transfer.h \ + transfer_instr.h \ + transfer_mult.h \ + transfer_token.h \ + transfer_word.h \ + transfer_word_list.h \ + trx_reader.h \ + tsx_reader.h \ + ttag.h \ + unlocked_cstdio.h \ + utf_converter.h \ + wchar_t_exception.h \ + wchar_t_exception_type.h + +#DEPR.: +# lextor_data.h +# lextor_eval.h +# lextor.h +# lextor_word.h + +cc_sources = a.cc \ + align.cc \ + analysis.cc \ + apertium_re.cc \ + basic_5_3_1_tagger.cc \ + basic_5_3_2_tagger.cc \ + basic_exception_type.cc \ + basic_stream_tagger.cc \ + basic_stream_tagger_trainer.cc \ + basic_tagger.cc \ + collection.cc \ + constant_manager.cc \ + endian_double_util.cc \ + exception_type.cc \ + file_tagger.cc \ + hmm.cc \ + i.cc \ + interchunk.cc \ + interchunk_word.cc \ + latex_accentsmap.cc \ + lemma.cc \ + linebreak.cc \ + lswpost.cc \ + morpheme.cc \ + morpho_stream.cc \ + postchunk.cc \ + stream.cc \ + stream_5_3_1_tagger.cc \ + stream_5_3_2_tagger.cc \ + stream_5_3_3_tagger.cc \ + stream_5_3_1_tagger_trainer.cc \ + stream_5_3_2_tagger_trainer.cc \ + stream_5_3_3_tagger_trainer.cc \ + string_utils.cc \ + tag.cc \ + tagger_data.cc \ + tagger_data_hmm.cc \ + tagger_data_lsw.cc \ + tagger_utils.cc \ + tagger_word.cc \ + tmx_aligner_tool.cc \ + tmx_alignment.cc \ + tmx_arguments_parser.cc \ + tmx_book_to_matrix.cc \ + tmx_builder.cc \ + tmx_dictionary.cc \ + tmx_strings_and_streams.cc \ + tmx_trail_postprocessors.cc \ + tmx_translate.cc \ + transfer.cc \ + transfer_data.cc \ + transfer_instr.cc \ + transfer_mult.cc \ + transfer_token.cc \ + transfer_word.cc \ + transfer_word_list.cc \ + trx_reader.cc \ + tsx_reader.cc \ + utf_converter.cc \ + wchar_t_exception_type.cc +#DEPR.: +# lextor.cc +# lextor_data.cc +# lextor_eval.cc +# lextor_word.cc + +library_includedir = 
$(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) +library_include_HEADERS = $(h_sources) + +GENERATEDSCRIPTS = apertium-gen-deformat apertium-gen-reformat \ + apertium-validate-tagger \ + apertium-validate-transfer apertium-validate-dictionary \ + apertium-validate-modes \ + apertium-validate-interchunk \ + apertium-validate-postchunk apertium apertium-unformat \ + apertium-gen-modes apertium-validate-acx \ + apertium-utils-fixlatex +#DEPR.: + #apertium-preprocess-corpus-lextor + #apertium-gen-stopwords-lextor + #apertium-gen-lextorbil + #apertium-gen-lextormono apertium-gen-wlist-lextor + +lib_LTLIBRARIES = libapertium3.la +libapertium3_la_SOURCES = $(h_sources) $(cc_sources) +libapertium3_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) -release $(GENERIC_RELEASE) + +bin_PROGRAMS = apertium-deshtml \ + apertium-deslatex \ + apertium-desmediawiki \ + apertium-desodt \ + apertium-despptx \ + apertium-desrtf \ + apertium-destxt \ + apertium-deswxml \ + apertium-desxlsx \ + apertium-desxpresstag \ + apertium-filter-ambiguity \ + apertium-interchunk \ + apertium-multiple-translations \ + apertium-postchunk \ + apertium-postlatex \ + apertium-postlatex-raw \ + apertium-prelatex \ + apertium-preprocess-transfer \ + apertium-pretransfer \ + apertium-rehtml \ + apertium-rehtml-noent \ + apertium-relatex \ + apertium-remediawiki \ + apertium-reodt \ + apertium-repptx \ + apertium-rertf \ + apertium-retxt \ + apertium-rewxml \ + apertium-rexlsx \ + apertium-rexpresstag \ + apertium-tagger \ + apertium-tagger-apply-new-rules \ + apertium-tagger-readwords \ + apertium-tmxbuild \ + apertium-transfer + +bin_SCRIPTS = $(GENERATEDSCRIPTS) + +instdir = apertium + +apertiumdir = $(prefix)/share/apertium +apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION) +apertiumlib = $(prefix)/lib +apertiumsysconf = $(prefix)/etc/apertium + +apertium_DATA = deformat.xsl reformat.xsl new2old.xsl lexchoice.xsl \ + lexchoicebil.xsl \ + 
tagger.dtd interchunk.dtd format.dtd transfer.dtd postchunk.dtd modes.dtd \ + tagger.rnc interchunk.rnc format.rnc transfer.rnc postchunk.rnc modes.rnc \ + modes2bash.xsl modes2debugmodes.xsl \ + apertium-createmodes.awk + +apertium_pretransfer_SOURCES = apertium_pretransfer.cc +apertium_multiple_translations_SOURCES = apertium-multiple-translations.cc +apertium_multiple_translations_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) +apertium_destxt_SOURCES = apertium_destxt.cc +apertium_retxt_SOURCES = apertium_retxt.cc +apertium_deshtml_SOURCES = apertium_deshtml.cc +apertium_rehtml_SOURCES = apertium_rehtml.cc +apertium_rehtml_noent_SOURCES = apertium_rehtml_noent.cc +apertium_desxpresstag_SOURCES = apertium_desxpresstag.cc +apertium_rexpresstag_SOURCES = apertium_rexpresstag.cc +apertium_desodt_SOURCES = apertium_desodt.cc +apertium_reodt_SOURCES = apertium_reodt.cc +apertium_desrtf_SOURCES = apertium_desrtf.cc +apertium_rertf_SOURCES = apertium_rertf.cc +apertium_deswxml_SOURCES = apertium_deswxml.cc +apertium_rewxml_SOURCES = apertium_rewxml.cc +apertium_deslatex_SOURCES = apertium_deslatex.cc +apertium_relatex_SOURCES = apertium_relatex.cc +apertium_desxlsx_SOURCES = apertium_desxlsx.cc +apertium_rexlsx_SOURCES = apertium_rexlsx.cc +apertium_despptx_SOURCES = apertium_despptx.cc +apertium_repptx_SOURCES = apertium_repptx.cc +apertium_desmediawiki_SOURCES = apertium_desmediawiki.cc +apertium_remediawiki_SOURCES = apertium_remediawiki.cc +apertium_prelatex_SOURCES = apertium_prelatex.cc +apertium_prelatex_LDADD= $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) +apertium_postlatex_SOURCES = apertium_postlatex.cc +apertium_postlatex_LDADD= $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) +apertium_postlatex_raw_SOURCES = apertium_postlatex_raw.cc +apertium_postlatex_raw_LDADD= $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tagger_SOURCES = apertium_tagger.cc +apertium_tagger_LDADD = $(APERTIUM_LIBS) 
-lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tmxbuild_SOURCES = apertium_tmxbuild.cc +apertium_tmxbuild_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_preprocess_transfer_SOURCES = transferpp.cc +apertium_preprocess_transfer_LDADD = $(APERTIUM_LIBS) \ + -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_filter_ambiguity_SOURCES = apertium_filter_ambiguity.cc +apertium_filter_ambiguity_LDADD = $(APERTIUM_LIBS) \ + -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_transfer_SOURCES = apertium_transfer.cc +apertium_transfer_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_interchunk_SOURCES = apertium_interchunk.cc +apertium_interchunk_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_postchunk_SOURCES = apertium_postchunk.cc +apertium_postchunk_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###apertium_lextor_SOURCES = apertium_lextor.cc +###apertium_lextor_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +#apertium_lextor_eval_SOURCES = apertium-lextor-eval.C +#apertium_lextor_eval_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tagger_apply_new_rules_SOURCES = apertium_tagger_apply_new_rules.cc +apertium_tagger_apply_new_rules_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tagger_readwords_SOURCES = apertium_tagger_readwords.cc +apertium_tagger_readwords_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###apertium_lextor_search_SOURCES = apertium-lextor-search.C +###apertium_lextor_search_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###pruebas_lextor_SOURCES = pruebas-lextor.C +###pruebas_lextor_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###apertium_gen_wlist_lextor_translation_SOURCES = apertium_gen_wlist_lextor_translation.cc +###apertium_gen_wlist_lextor_translation_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + + +if WINDOWS +INCLUDES = 
-I$(top_srcdir)/apertium/win32 -I$(top_srcdir) $(APERTIUM_CFLAGS) +else +INCLUDES = -I$(top_srcdir) $(APERTIUM_CFLAGS) +endif +CLEANFILES = *~ apertium_destxt.cc apertium_retxt.cc apertium_deshtml.cc \ + apertium_rehtml.cc apertium_desrtf.cc apertium_rertf.cc \ + apertium_rehtml_noent.cc \ + apertium_deswxml.cc apertium_rewxml.cc \ + apertium_deslatex.cc apertium_relatex.cc \ + apertium_desxlsx.cc apertium_rexlsx.cc \ + apertium_despptx.cc apertium_repptx.cc \ + apertium_desodt.cc apertium_reodt.cc \ + apertium_desxpresstag.cc apertium_rexpresstag.cc \ + apertium_desmediawiki.cc apertium_remediawiki.cc \ + apertium_prelatex.cc apertium_postlatex.cc \ + $(GENERATEDSCRIPTS) + +apertium_destxt.cc: txt-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl txt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desxpresstag.cc: xpresstag-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl xpresstag-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rexpresstag.cc: xpresstag-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl xpresstag-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_retxt.cc: txt-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl txt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_deshtml.cc: html-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl html-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rehtml.cc: html-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl html-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rehtml_noent.cc: html-noent-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl html-noent-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desodt.cc: odt-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl odt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_reodt.cc: odt-format.xml Makefile.am 
reformat.xsl + $(XSLTPROC) reformat.xsl odt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desrtf.cc: rtf-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl rtf-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rertf.cc: rtf-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl rtf-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_deswxml.cc: wxml-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl wxml-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rewxml.cc: wxml-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl wxml-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_deslatex.cc: latex-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl latex-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_relatex.cc: latex-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl latex-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + + + +apertium_desxlsx.cc: xlsx-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl xlsx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rexlsx.cc: xlsx-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl xlsx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_despptx.cc: pptx-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl pptx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_repptx.cc: pptx-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl pptx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desmediawiki.cc: mediawiki-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl mediawiki-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_remediawiki.cc: mediawiki-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl mediawiki-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_prelatex.cc: 
apertium-prelatex.l + $(FLEX) -Cfer -o$@ apertium-prelatex.l + +apertium_postlatex.cc: apertium-postlatex.l + $(FLEX) -Cfer -o$@ apertium-postlatex.l + +apertium_postlatex_raw.cc: apertium-postlatex-raw.l + $(FLEX) -Cfer -o$@ apertium-postlatex-raw.l + +apertium-validate-tagger: Makefile.am validate-header.sh + @echo "Creating apertium-validate-tagger script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/tagger.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-transfer: Makefile.am validate-header.sh + @echo "Creating apertium-validate-transfer script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/transfer.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-interchunk: Makefile.am validate-header.sh + @echo "Creating apertium-validate-interchunk script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/interchunk.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-postchunk: Makefile.am validate-header.sh + @echo "Creating apertium-validate-postchunk script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/postchunk.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-acx: Makefile.am validate-header.sh + @echo "Creating apertium-validate-acx script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --relaxng \"$(prefix)\"/share/lttoolbox/acx.rng --schema \"$(prefix)\"/share/lttoolbox/acx.xsd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-modes: Makefile.am validate-header.sh + @echo "Creating apertium-validate-modes script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/modes.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + + +apertium-validate-dictionary: 
Makefile.am validate-header.sh + @echo "Creating apertium-validate-dictionary script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "# xsd is a non-final command, so just treated as a warning when compiling:" >> $@ + @echo "$(XMLLINT) --schema \"$(prefix)\"/share/lttoolbox/dix.xsd --noout \"\$$FILE1\" | grep -vF ' fails to validate'" >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(prefix)\"/share/lttoolbox/dix.dtd --noout \"\$$FILE1\"" >> $@ + @chmod a+x $@ + +apertium-gen-deformat: Makefile.am deformat-header.sh + @echo "Creating apertium-gen-deformat script" + @echo "#!$(BASH)" > $@ + @cat deformat-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \$$FILE1 && \\" >> $@ + @if [ `basename $(XSLTPROC)` == xsltproc ]; \ + then echo "$(XSLTPROC) --stringparam mode \$$MODE \"$(apertiumdir)\"/deformat.xsl \$$FILE1 >/tmp/\$$\$$.deformat.l && \\"; \ + else echo "$(XSLTPROC) \"$(apertiumdir)\"/deformat.xsl \$$FILE1 \"\\\$$mode=\$$MODE\" >/tmp/\$$\$$.deformat.l && \\"; \ + fi >> $@ + @echo "$(FLEX) \$$FLEXOPTS -o/tmp/\$$\$$.lex.cc /tmp/\$$\$$.deformat.l && \\" >> $@ + @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \$$FILE2 /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null && \\" >> $@ + @echo "rm /tmp/\$$\$$.deformat.l /tmp/\$$\$$.lex.cc" >> $@ + @chmod a+x $@ + +apertium-gen-reformat: Makefile.am gen-header.sh + @echo "Creating apertium-gen-reformat script" + @echo "#!$(BASH)" > $@ + @cat gen-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \$$FILE1 && \\" >> $@ + @echo "$(XSLTPROC) \"$(apertiumdir)\"/reformat.xsl \$$FILE1 >/tmp/\$$\$$.reformat.l && \\" >> $@ + @echo "$(FLEX) \$$FLEXOPTS -o/tmp/\$$\$$.lex.cc /tmp/\$$\$$.reformat.l && \\" >> $@ + @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \$$FILE2 /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null &&\\" >> $@ + @echo "rm /tmp/\$$\$$.reformat.l /tmp/\$$\$$.lex.cc" >> $@ 
+ @chmod a+x $@ + +apertium-gen-modes: apertium-gen-modes.in Makefile.am + @echo "#!$(BASH)" > $@ + @echo "APERTIUMDIR=$(apertiumdir)" >> $@ + @echo "XMLLINT=$(XMLLINT)" >> $@ + @echo "XSLTPROC=$(XSLTPROC)" >> $@ + @cat $< >> $@ + @chmod a+x $@ + +apertium-utils-fixlatex: Makefile.am utils-fixlatex-header.sh + @echo "Creating apertium-utils-fixlatex script" + @echo "#!$(BASH)" > $@ + @cat utils-fixlatex-header.sh >> $@ + @chmod a+x $@ + +apertium: Makefile.am apertium-header.sh + @echo "Creating apertium script" + @echo "#!$(BASH)" > $@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "DEFAULT_DIRECTORY=\"$(prefix)/share/apertium\"" >>$@ + @cat apertium-header.sh >>$@ + @chmod a+x $@ + +apertium-unformat: Makefile.am apertium-unformat-header.sh + @echo "Creating apertium-unformat script" + @echo "#!$(BASH)" > $@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "DEFAULT_DIRECTORY=\"$(prefix)/share/apertium\"" >>$@ + @cat apertium-unformat-header.sh >>$@ + @chmod a+x $@ + + +#apertium-translator-lextor: Makefile.am trans-lextor-header.sh +# @echo "Creating apertium-translator-lextor script" +# @echo "#!$(BASH)" > $@ +# @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ +# @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ +# @cat trans-lextor-header.sh >>$@ +# @chmod a+x $@ + +#apertium-gen-oldbil: Makefile.am transformdicbil-header.sh +# @echo "Creating apertium-gen-oldbil script" +# @echo "#!$(BASH)" >$@ +# @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ +# @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ +# @echo "XSLTPROC_OPTIONS=\"\"">>$@ +# @echo "STYLESHEET=\"$(apertiumdir)/new2old.xsl\"">>$@ +# @cat transformdicbil-header.sh >>$@ +# @chmod a+x $@ + +apertium-gen-lextorbil: Makefile.am transformdic-header.sh + @echo "Creating apertium-gen-lextorbil script" + @echo "#!$(BASH)" >$@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" 
>>$@ + @echo "XSLTPROC_OPTIONS_LR=\"\"">>$@ + @echo "XSLTPROC_OPTIONS_RL=\"--stringparam r2l yes\"">>$@ + @echo "STYLESHEET=\"$(apertiumdir)/lexchoicebil.xsl\"">>$@ + @cat transformdic-header.sh >>$@ + @chmod a+x $@ + +apertium-gen-lextormono: Makefile.am transformdic-header.sh + @echo "Creating apertium-gen-lextormono script" + @echo "#!$(BASH)" >$@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "XSLTPROC_OPTIONS_LR=\"\"">>$@ + @echo "XSLTPROC_OPTIONS_RL=\"--stringparam r2l yes\"">>$@ + @echo "STYLESHEET=\"$(apertiumdir)/lexchoice.xsl\"">>$@ + @cat transformdic-header.sh >>$@ + @chmod a+x $@ + +apertium-gen-wlist-lextor: Makefile.am gen-wlist-lextor-header.sh + @echo "Creating apertium-gen-wlist-lextor script" + @echo "#!$(BASH)" >$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @cat gen-wlist-lextor-header.sh >>$@ + @chmod a+x $@ + +apertium-preprocess-corpus-lextor: Makefile.am preprocess-corpus-lextor.sh + @echo "Creating apertium-preprocess-corpus-lextor script" + @echo "#!$(BASH)" >$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @cat preprocess-corpus-lextor.sh >>$@ + @chmod a+x $@ + +apertium-gen-stopwords-lextor: Makefile.am gen-stopwords-lextor.sh + @echo "Creating apertium-gen-stopwords-lextor script" + @echo "#!$(BASH)" >$@ + @cat gen-stopwords-lextor.sh >>$@ + @chmod a+x $@ + +man_MANS=apertium.1 apertium-deshtml.1 apertium-desrtf.1 apertium-destxt.1 \ + apertium-desodt.1 apertium-reodt.1 \ + apertium-deswxml.1 apertium-rewxml.1 \ + apertium-deslatex.1 apertium-relatex.1 \ + apertium-prelatex.1 apertium-postlatex.1 apertium-postlatex-raw.1 \ + apertium-desxlsx.1 apertium-rexlsx.1 \ + apertium-despptx.1 apertium-repptx.1 \ + apertium-desmediawiki.1 apertium-remediawiki.1 \ + apertium-filter-ambiguity.1 apertium-gen-deformat.1 \ + apertium-gen-reformat.1 \ + apertium-preprocess-transfer.1 apertium-pretransfer.1 apertium-rehtml.1 \ + 
apertium-rertf.1 apertium-retxt.1 apertium-tagger.1 apertium-transfer.1 \ + apertium-validate-dictionary.1 apertium-validate-tagger.1 \ + apertium-validate-transfer.1 apertium-gen-modes.1 apertium-interchunk.1 \ + apertium-postchunk.1 apertium-validate-interchunk.1 apertium-utils-fixlatex.1 \ + apertium-validate-postchunk.1 apertium-validate-modes.1 apertium-tagger-apply-new-rules.1 \ + apertium-validate-acx.1 apertium-multiple-translations.1 \ + apertium-unformat.1 +#DEPR.: +# apertium-lextor-eval.1 +# apertium-gen-lextorbil.1 +# apertium-gen-lextormono.1 apertium-gen-stopwords-lextor.1 +# apertium-gen-wlist-lextor.1 apertium-gen-wlist-lextor-translation.1 +# apertium-lextor.1 apertium-preprocess-corpus-lextor.1 + +EXTRA_DIST = gen-header.sh deformat-header.sh \ + reformat.xsl deformat.xsl new2old.xsl lexchoice.xsl lexchoicebil.xsl \ + txt-format.xml \ + html-format.xml odt-format.xml rtf-format.xml wxml-format.xml latex-format.xml\ + html-noent-format.xml \ + xlsx-format.xml pptx-format.xml mediawiki-format.xml trans-header.sh \ + apertium-postlatex.l apertium-postlatex-raw.l apertium-prelatex.l \ + apertium-header.sh apertium-unformat-header.sh $(man_MANS) \ + xpresstag-format.xml \ + validate-header.sh transformdic-header.sh transformdicbil-header.sh \ + tagger.dtd interchunk.dtd format.dtd transfer.dtd postchunk.dtd modes.dtd \ + tagger.rnc interchunk.rnc format.rnc transfer.rnc postchunk.rnc modes.rnc \ + utils-fixlatex-header.sh \ + apertium-gen-modes.in apertium-createmodes.awk modes2bash.xsl modes2debugmodes.xsl +#DEPR.: +# trans-lextor-header.sh +# gen-wlist-lextor-header.sh +# gen-stopwords-lextor.sh +# preprocess-corpus-lextor.sh Index: branches/apertium-tagger/apertium2/apertium/validate-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/validate-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/validate-header.sh (revision 69632) @@ -0,0 +1,12 @@ +if [[ $# 
!= 1 ]]; then + echo "USAGE: $(basename "$0") " + exit 1 +fi + +FILE1=$1 + +if [[ ! -e $FILE1 ]]; then + echo "ERROR: '$1' file not found" + exit 1 +fi + Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.in =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.in (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.in (revision 69632) @@ -0,0 +1,85 @@ +#!/bin/bash +# Makefile.am prepends APERTIUMDIR, XMLLINT, XSLTPROC and the right shebang + +show_help () { + cat <&2 + exit 1 + ;; + esac +done +shift $((OPTIND-1)) + +xmlfile="$1" +if [[ ! -e "${xmlfile}" ]]; then + echo "ERROR: '${xmlfile}' file not found" + exit 1 +fi +xmldir=$(cd "$(dirname "${xmlfile}")"; pwd) + +case $# in + 1) installdir="${xmldir}";; + 2) if ${fullpath}; then + installdir="$2" + else + installdir="${APERTIUMDIR}/$2" + fi + ;; + *) show_help >&2 + exit 1 + ;; +esac + +$verbose && set -x +set -o pipefail # introduced in bash 3; available in OSX>=10.5; should be safe + +[[ -d "${xmldir}"/modes ]] || mkdir "${xmldir}"/modes + +"${XMLLINT}" --dtdvalid "${APERTIUMDIR}"/modes.dtd --noout "${xmlfile}" || exit $? 
+ +"${XSLTPROC}" "${APERTIUMDIR}"/modes2debugmodes.xsl "${xmlfile}" \ + | "${XSLTPROC}" --stringparam devdir "${xmldir}" \ + --stringparam installdir "${installdir}" \ + "${APERTIUMDIR}"/modes2bash.xsl \ + - \ + | awk -f "${APERTIUMDIR}"/apertium-createmodes.awk Index: branches/apertium-tagger/apertium2/apertium/hmm.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/hmm.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/hmm.h (revision 69632) @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +/** + * First order hidden Markov model (HMM) implementation (header) + * + * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es + */ + +#ifndef __HMM_H +#define __HMM_H + +#include "file_tagger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define ZERO 1e-10 + +/** HMM + * first-order hidden Markov Model + */ +class HMM : public Apertium::FILE_Tagger { +private: + TaggerDataHMM tdhmm; + TTag eos; // end-of-sentence tag + + /** It allocs memory for the transition (a) and the emission (b) matrices. + * Before calling this method the number of ambiguity classes must be known. 
+ * This method is called within read_ambiguity_classes and read_dictionary. + * @see: read_ambiguity_classes, read_dictionary + */ + void init(); +public: + void deserialise(FILE *Serialised_FILE_Tagger); + std::vector &getArrayTags(); + void train(FILE *Corpus, unsigned long Count); + void serialise(FILE *Stream_); + void deserialise(const TaggerData &Deserialised_FILE_Tagger); + void init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus); + void init_probabilities_kupiec_(FILE *Corpus); + HMM(); + HMM(TaggerDataHMM *tdhmm); + + /** Constructor + */ + HMM(TaggerDataHMM tdhmm); + + /** Destructor + */ + ~HMM(); + + /** Used to set the end-of-sentence tag + * @param t the end-of-sentence tag + */ + void set_eos(TTag t); + + /** It reads the ambiguity classes from the stream received as + * input + * @param in the input stream + */ + void read_ambiguity_classes(FILE *in); + + /** It writes the ambiguity classes to the stream received as + * a parameter + * @param out the output stream + */ + void write_ambiguity_classes(FILE *out); + + /** It reads the probabilities (matrices a and b) from the stream + * received as a parameter + * @param in the input stream + */ + void read_probabilities(FILE *in); + + /** It writes the probabilities (matrices a and b) to the stream + * received as a parameter + * @param out the output stream + */ + void write_probabilities(FILE *out); + + /** It reads the expanded dictionary received as a parameter and calculates + * the set of ambiguity classes that the tagger will manage.
+ * @param is the input stream with the expanded dictionary to read + */ + void read_dictionary(FILE *is); + + /** It initializes the transition (a) and emission (b) probabilities + * from an untagged input text by means of Kupiec's method + * @param is the input stream with the untagged corpus to process + */ + void init_probabilities_kupiec (FILE *is); + + /** It initializes the transition (a) and emission (b) probabilities + * from a tagged input text by means of the expected-likelihood + * estimate (ELE) method + * @param ftagged the input stream with the tagged corpus to process + * @param funtagged the same corpus to process but untagged + */ + void init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged); + + /** It applies the forbid and enforce rules found in tagger specification. + * To do so the transition matrix is modified by introducing null probabilities + * in the involved transitions. + */ + void apply_rules(); + + /** Unsupervised training algorithm (Baum-Welch implementation). + * @param is the input stream with the untagged corpus to process + */ + void train (FILE *is); + + /** Tagging algorithm (Viterbi implementation). + * @param Input the input stream with the untagged text to tag + * @param Output the output stream with the tagged text + */ + void tagger(FILE *Input, FILE *Output, const bool &First = false); + + /** Prints the A matrix. + */ + void print_A(); + + /** Prints the B matrix. + */ + void print_B(); + + /** Prints the ambiguity classes.
+ */ + void print_ambiguity_classes(); + + void filter_ambiguity_classes(FILE *in, FILE *out); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/lswpost.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/lswpost.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lswpost.h (revision 69632) @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +/** + * Light Sliding-Window Part of Speech Tagger (LSWPoST) implementation (header) + * + * @author Gang Chen - pkuchengang@gmail.com + */ + +#ifndef __LSWPOST_H +#define __LSWPOST_H + +#include "file_tagger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +#define ZERO 1e-10 + +/** LSWPoST + * Light Sliding-Window Part of Speech Tagger + */ +class LSWPoST : public Apertium::FILE_Tagger { +private: + TaggerDataLSW tdlsw; + TTag eos; // end-of-sentence tag + +public: + void deserialise(FILE *Serialised_FILE_Tagger); + std::vector &getArrayTags(); + void train(FILE *Corpus, unsigned long Count); + void serialise(FILE *Stream_); + void deserialise(const TaggerData &Deserialised_FILE_Tagger); + void init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus); + void init_probabilities_kupiec_(FILE *Corpus); + LSWPoST(); + LSWPoST(TaggerDataLSW *tdlsw); + + /** Constructor + */ + LSWPoST(TaggerDataLSW t); + + /** Destructor + */ + ~LSWPoST(); + + /** Used to set the end-of-sentence tag + * @param t the end-of-sentence tag + */ + void set_eos(TTag t); + + /** It reads the expanded dictionary received as a parameter and calculates + * the set of ambiguity classes that the tagger will manage. + * @param fdic the input stream with the expanded dictionary to read + */ + void read_dictionary(FILE *fdic); + + /** Whether a tag sequence is valid, according to the forbid and enforce rules + */ + bool is_valid_seq(TTag left, TTag mid, TTag right); + + /** Init probabilities + * It applies the forbid and enforce rules found in tagger specification. + * To do so, the joint probability of a tag sequence that contains a forbid + * rule, or doesn't satisfy a enforce rule, is set to 0. + */ + void init_probabilities(FILE *ftxt); + + /** Unsupervised training algorithm (Baum-Welch implementation). 
+ * @param ftxt the input stream with the untagged corpus to process + */ + void train (FILE *ftxt); + + /** Prints the para matrix. + */ + void print_para_matrix(); + + /** Do the tagging + */ + void tagger(FILE *Input, FILE *Output, const bool &First = false); +}; +#endif Index: branches/apertium-tagger/apertium2/apertium/file_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/file_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/file_tagger.h (revision 69632) @@ -0,0 +1,52 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef FILE_TAGGER_H +#define FILE_TAGGER_H + +#include + +#include +#include +#include + +namespace Apertium { +class FILE_Tagger { +public: + FILE_Tagger(); + virtual ~FILE_Tagger(); + virtual void deserialise(FILE *Serialised_FILE_Tagger) = 0; + void set_debug(const bool &Debug); + void set_show_sf(const bool &ShowSuperficial); + void setNullFlush(const bool &NullFlush); + virtual void tagger(FILE *Input, FILE *Output, const bool &First = false) = 0; + virtual std::vector &getArrayTags() = 0; + virtual void train(FILE *Corpus, unsigned long Count) = 0; + virtual void serialise(FILE *Stream_) = 0; + void deserialise(char *const TaggerSpecificationFilename); + virtual void read_dictionary(FILE *Dictionary) = 0; + virtual void init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *Corpus) = 0; + virtual void init_probabilities_kupiec_(FILE *Corpus) = 0; + +protected: + virtual void deserialise(const TaggerData &Deserialised_FILE_Tagger) = 0; + bool debug; + bool show_sf; + bool null_flush; +}; +} + +#endif // FILE_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/format.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/format.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/format.rng (revision 69632) @@ -0,0 +1,303 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + no + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + comment + empty + open + close + + + + + + + + + yes + no + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + no + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/interchunk.rng 
=================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.rng (revision 69632) @@ -0,0 +1,971 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/postchunk.rng =================================================================== --- 
branches/apertium-tagger/apertium2/apertium/postchunk.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.rng (revision 69632) @@ -0,0 +1,971 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/tagger.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger.rng (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tagger.rng (revision 69632) @@ -0,0 +1,310 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/format.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/format.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/format.rnc (revision 69632) @@ -0,0 +1,111 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# DTD for the format specification files +# Sergio Ortiz 2005.05.13 + +format = element format { attlist.format, options, rules } +attlist.format &= attribute name { text } +# 'format' is the root element containing the whole format specification +# file. 
The attribute 'name' specifies the name of the format +options = + element options { + attlist.options, + largeblocks, + input, + output, + tag-name, + escape-chars, + space-chars, + case-sensitive + } +attlist.options &= empty +# General options of the format +largeblocks = element largeblocks { attlist.largeblocks, empty } +attlist.largeblocks &= attribute size { text } +# The attribute size is used to define the maximal size in bytes of +# inline format blocks +input = element input { attlist.input, empty } +attlist.input &= attribute zip-path { text }? +attlist.input &= attribute encoding { text } +# Reserved for future extensions +output = element output { attlist.output, empty } +attlist.output &= attribute zip-path { text }? +attlist.output &= attribute encoding { text } +# Reserved for future extensions +tag-name = element tag-name { attlist.tag-name, empty } +attlist.tag-name &= attribute regexp { text } +# The attribute regexp defines (with a _flex_ regular expression) how to +# take a tag name from a whole tag. '\' +escape-chars = element escape-chars { attlist.escape-chars, empty } +attlist.escape-chars &= attribute regexp { text } +# The attribute regexp defines (with a _flex_ regular expression) the +# set of characters to be escaped with a preceding backslash '\' +space-chars = element space-chars { attlist.space-chars, empty } +attlist.space-chars &= attribute regexp { text } +# Define the space characters (in regexp) with a _flex_ regular +# expression +case-sensitive = + element case-sensitive { attlist.case-sensitive, empty } +attlist.case-sensitive &= attribute value { "yes" | "no" } +# The attribute 'value' is set to 'yes' if the case is relevant in the +# specification of the format.
Otherwise it is set to 'no' +rules = + element rules { attlist.rules, (format-rule | replacement-rule)+ } +attlist.rules &= empty +# Group the rules of processing format and the rules of substitute +# expressions by characters that are part of the text +format-rule = + element format-rule { + attlist.format-rule, + (tag | (begin, end)) + } +attlist.format-rule &= + attribute type { "comment" | "empty" | "open" | "close" }? +attlist.format-rule &= attribute eos { "yes" | "no" }? +attlist.format-rule &= attribute priority { text } +# Format rule parent element. It may include a 'tag' element or +# a couple of elements 'begin', 'end'. In the first case, this element is +# considered to be part of the format. In the second case, the begin and +# the end element are considered to enclose the format. The attribute +# 'eos' (end of sentence) is set to 'yes' if that rule defines a dot in +# the text being processed (it is 'no' by default). The attribute 'priority' +# marks the order of precedence of the rule +tag = element tag { attlist.tag, empty } +attlist.tag &= attribute regexp { text } +# Define an element that is part of the format by the pattern specified +# as a value for the regexp attribute +begin = element begin { attlist.begin, empty } +attlist.begin &= attribute regexp { text } +# The attribute 'regexp' is the regular expression that detects the +# beginning delimiter of a block of format +end = element end { attlist.end, empty } +attlist.end &= attribute regexp { text } +# The attribute 'regexp' is the regular expression that detects the +# ending delimiter of a block of format +replacement-rule = + element replacement-rule { attlist.replacement-rule, replace+ } +attlist.replacement-rule &= attribute regexp { text } +# Root element for a replacement rule.
The attribute 'regexp' is the +# general expression to detect the elements to replace +replace = element replace { attlist.replace, empty } +attlist.replace &= attribute source { text } +attlist.replace &= attribute target { text } +attlist.replace &= attribute prefer { "yes" | "no" }? +start = format +# Replacement rule. The 'source' is a string of one or more characters. +# The 'target' MUST be a single character. The 'prefer' attribute, when +# set to 'yes' defines the preferred reverse translation of the +# replacement. Index: branches/apertium-tagger/apertium2/apertium/interchunk.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.rnc (revision 69632) @@ -0,0 +1,353 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# Draft of DTD for the structural transfer rule files +# +# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada, +# 2005.07.29. 
+ +condition = + and + | or + | not + | equal + | begins-with + | begins-with-list + | ends-with + | ends-with-list + | contains-substring + | in +container = var | clip +sentence = let | out | choose | modify-case | call-macro | append +value = + b + | clip + | lit + | lit-tag + | var + | get-case-from + | case-of + | concat + | chunk +stringvalue = clip | lit | var | get-case-from | case-of +interchunk = + element interchunk { + attlist.interchunk, + section-def-cats, + section-def-attrs, + section-def-vars, + section-def-lists?, + section-def-macros?, + section-rules + } +attlist.interchunk &= empty +# 'interchunk' is the root element containing the whole structural +# interchunk rule file. +section-def-cats = + element section-def-cats { attlist.section-def-cats, def-cat+ } +attlist.section-def-cats &= empty +# The 'def-cats' section defines the categories used to build the +# patterns used in rules +def-cat = element def-cat { attlist.def-cat, cat-item+ } +attlist.def-cat &= + attribute n { xsd:ID }, + attribute c { text }? +# Each 'def-cat' defines one category in terms of a list of +# category items and has a unique name 'n', which is mandatory +cat-item = element cat-item { attlist.cat-item, empty } +attlist.cat-item &= + attribute lemma { text }?, + attribute tags { text }, + attribute c { text }? +# Each 'cat-item' (category item) represents a set of lexical forms +# and has a mandatory attribute 'tags' whose value is a sequence of +# dot-separated tag names; this sequence is a subsequence of the +# tag sequence defining each possible lexical form. For example, +# tags="n.f" would match all lexical forms containing this tag +# sequence, such as "^casa$". 
+# +# In addition, an optional attribute, "lemma", may be used to +# define lexical forms having a particular substring in their lemma +section-def-attrs = + element section-def-attrs { attlist.section-def-attrs, def-attr+ } +attlist.section-def-attrs &= empty +# The 'def-attrs' section defines the attributes that will be +# identified in matched lexical forms +def-attr = element def-attr { attlist.def-attr, attr-item+ } +attlist.def-attr &= + attribute n { xsd:ID }, + attribute c { text }? +# Each def-attr defines one attribute in terms of a list of +# attribute items and has a mandatory unique name n +attr-item = element attr-item { attlist.attr-item, empty } +attlist.attr-item &= + attribute tags { text }?, + attribute c { text }? +# Each 'attr-item' specifies a subsequence of the tags in +# that lexical form (attribute 'tags') +section-def-vars = + element section-def-vars { attlist.section-def-vars, def-var+ } +attlist.section-def-vars &= empty +# The 'def-vars' section defines the global variables +# that will be used to transfer information between rules +def-var = element def-var { attlist.def-var, empty } +attlist.def-var &= + attribute n { xsd:ID }, + attribute v { text }?, + attribute c { text }? +# The definition of a global variable has a mandatory unique name 'n' that +# will be used to refer to it. A value of initialization can also be specified +# by means the 'v' attribute. The default value of the initialization is the +# empty string. +section-def-lists = + element section-def-lists { attlist.section-def-lists, def-list+ } +attlist.section-def-lists &= empty +# Element 'section-def-lists' encloses a set of list definitions +def-list = element def-list { attlist.def-list, list-item+ } +attlist.def-list &= + attribute n { xsd:ID }, + attribute c { text }? +# The 'def-list' element defines a named list to search with the 'in' +# element. 
Attribute 'n' sets the name of the list +list-item = element list-item { attlist.list-item, empty } +attlist.list-item &= + attribute v { text }, + attribute c { text }? +# Attribute 'v' of 'list-item' element contains the value to be added to +# the list being defined +section-def-macros = + element section-def-macros { attlist.section-def-macros, def-macro+ } +attlist.section-def-macros &= empty +# +# The 'def-macros' section defines macros containing portions of +# code frequently used in the action part of rules +# +def-macro = element def-macro { attlist.def-macro, sentence+ } +attlist.def-macro &= attribute n { xsd:ID } +attlist.def-macro &= + attribute npar { text }, + attribute c { text }? +# Macro definition: +# +# A macro has a mandatory name (the value of 'n'), a number of parameters +# (the value of 'npar') and a body containing arguments and statements. +section-rules = element section-rules { attlist.section-rules, rule+ } +attlist.section-rules &= empty +# The rules section contains a sequence of one or more rules +rule = element rule { attlist.rule, pattern, action } +attlist.rule &= attribute comment { text }? +# Each rule has a pattern and an action +# * attribute 'comment' allows to put in comments about the purpose of +# the rule being defined +pattern = element pattern { attlist.pattern, pattern-item+ } +attlist.pattern &= empty +# The pattern is specified in terms of pattern items, each one +# representing a lexical form in the matched pattern +pattern-item = element pattern-item { attlist.pattern-item, empty } +attlist.pattern-item &= attribute n { xsd:IDREF } +# Each attribute to be activated is referred to by its name in the def-cats section +action = element action { attlist.action, sentence* } +attlist.action &= attribute c { text }? +# Encloses the procedural part of a rule +choose = element choose { attlist.choose, when+, otherwise? } +attlist.choose &= attribute c { text }? 
+# The choose statement is a selection statement (similar to a case +# statement) composed of one or more tested cases and an optional +# otherwise +when = element when { attlist.when, test, sentence* } +attlist.when &= attribute c { text }? +# Each tested case is a block of zero or more statements +otherwise = element otherwise { attlist.otherwise, sentence+ } +attlist.otherwise &= attribute c { text }? +# The otherwise case is also a block of one or more statements +test = element test { attlist.test, condition } +attlist.test &= attribute c { text }? +# The test in a tested case may be a conjunction, a disjunction, or +# a negation of simpler tests, as well as a simple equality test +and = element and { attlist.and, condition, condition+ } +attlist.and &= empty +# Each conjunction test contains two or more simpler tests +or = element or { attlist.or, condition, condition+ } +attlist.or &= empty +# Each disjunction test contains two or more simpler tests +not = element not { attlist.not, condition } +attlist.not &= empty +# The negation of a simpler test is a test itself +equal = element equal { attlist.equal, value, value } +attlist.equal &= attribute caseless { "no" | "yes" }? +# The simplest test is an equality test. The right part and the +# left part of the equality may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with = element begins-with { attlist.begins-with, value, value } +attlist.begins-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section.
When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with = element ends-with { attlist.ends-with, value, value } +attlist.ends-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with-list = + element begins-with-list { attlist.begins-with-list, value, \list } +attlist.begins-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with-list = + element ends-with-list { attlist.ends-with-list, value, \list } +attlist.ends-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +contains-substring = + element contains-substring { + attlist.contains-substring, value, value + } +attlist.contains-substring &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part. 
+# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +in = element in { attlist.in, value, \list } +attlist.in &= attribute caseless { "no" | "yes" }? +# 'in' performs a search of a value in a list. If 'caseless' is set to yes, +# this search is performed without attending to the case +\list = element list { attlist.list, empty } +attlist.list &= attribute n { xsd:IDREF } +# 'list' refers, with the name in attribute 'n', a list defined before in +# the 'section-def-list' section +let = element let { attlist.let, container, value } +attlist.let &= empty +# An assignment statement ('let') assigns the value of a clip (see +# below), a literal string ('lit'), a literal tag('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip +append = element append { attlist.append, value+ } +attlist.append &= attribute n { xsd:IDREF } +# This instruction appends the value of a clip (see +# below), a literal string ('lit'), a literal tag('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip, identified by the "n" attribute +out = element out { attlist.out, (b | chunk | var)+ } +attlist.out &= attribute c { text }? +# 'out' is an output statement; it may output blanks or chunks +modify-case = + element modify-case { attlist.modify-case, container, stringvalue } +attlist.modify-case &= empty +# The first argument of 'modify-case' copy the case of the second +# argument. 
+call-macro = element call-macro { attlist.call-macro, with-param* } +attlist.call-macro &= attribute n { xsd:IDREF } +# A macro may be called anywhere by name with one or more +# arguments +with-param = element with-param { attlist.with-param, empty } +attlist.with-param &= attribute pos { text } +# The attribute pos in each argument is used to refer to a lexical +# form in the current rule. For example, if a 2-parameter macro +# has been defined to perform noun-adjective agreement operations, +# it may be used with arguments 1 and 2 in a noun-adjective rule, +# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with +# arguments 1 and 3 in a noun-adverb-adjective rule, and with +# arguments 2 and 1 in an adjective-noun rule +clip = element clip { attlist.clip, empty } +attlist.clip &= + attribute pos { text }, + attribute part { text }, + attribute c { text }? +# A 'clip' is a substring of a source-language or target-language +# lexical form, extracted according to an attribute: +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +# +lit = element lit { attlist.lit, empty } +attlist.lit &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +lit-tag = element lit-tag { attlist.lit-tag, empty } +attlist.lit-tag &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +var = element var { attlist.var, empty } +attlist.var &= attribute n { xsd:IDREF } +# Each 'var' is a variable identifier: the attribute n is the name +# of the variable. 
When it is in an 'out', a 'test', or the right +# part of a 'let', it represents the value of the variable; when in +# the left part of a 'let' it represents the reference of the +# variable. +get-case-from = + element get-case-from { attlist.get-case-from, (clip | lit | var) } +attlist.get-case-from &= attribute pos { text } +# Warning: all the comments that mention get-case-from still need +# to be updated +case-of = element case-of { attlist.case-of, empty } +attlist.case-of &= + attribute pos { text }, + attribute part { text } +# A 'case-of' is a value representing the case of a "clip". This value +# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA" +# (all uppercase). +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +concat = element concat { attlist.concat, value+ } +attlist.concat &= empty +# Concatenates a sequence of values +chunk = element chunk { attlist.chunk, value+ } +attlist.chunk &= empty +# Encloses a chunk +pseudolemma = element pseudolemma { attlist.pseudolemma, value } +attlist.pseudolemma &= empty +b = element b { attlist.b, empty } +attlist.b &= attribute pos { text }? +start = interchunk | pseudolemma +# 'b' is a [super]blanks item, indexed by pos; for example, a 'b' +# with pos="2" refers to the [super]blanks (including format data +# encapsulated by the de-formatter) between lexical form 2 and +# lexical form 3. Managing [super]blanks explicitly allows for the +# correct placement of format when the result of structural +# transfer has more or less lexical items than the original or has +# been reordered in some way. If attribute "pos" is not specified, then +# a single blank (ASCII 32) is generated.
Index: branches/apertium-tagger/apertium2/apertium/postchunk.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/postchunk.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.rnc (revision 69632) @@ -0,0 +1,348 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# Draft of DTD for the structural transfer rule files +# +# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada, +# 2005.07.29. + +condition = + and + | or + | not + | equal + | begins-with + | begins-with-list + | ends-with + | ends-with-list + | contains-substring + | in +container = var | clip +sentence = let | out | choose | modify-case | call-macro | append +value = + b + | clip + | lit + | lit-tag + | var + | get-case-from + | case-of + | concat + | lu-count + | lu + | mlu +stringvalue = clip | lit | var | get-case-from | case-of | lu-count +postchunk = + element postchunk { + attlist.postchunk, + section-def-cats, + section-def-attrs, + section-def-vars, + section-def-lists?, + section-def-macros?, + section-rules + } +attlist.postchunk &= empty +# 'postchunk' is the root element containing the whole structural +# postchunk rule file. 
+section-def-cats = + element section-def-cats { attlist.section-def-cats, def-cat+ } +attlist.section-def-cats &= empty +# The 'def-cats' section defines the categories used to build the +# patterns used in rules +def-cat = element def-cat { attlist.def-cat, cat-item+ } +attlist.def-cat &= + attribute n { xsd:ID }, + attribute c { text }? +# Each 'def-cat' defines one category in terms of a list of +# category items and has a unique name 'n', which is mandatory +cat-item = element cat-item { attlist.cat-item, empty } +attlist.cat-item &= attribute name { text } +# A required attribute, "name", is used to specify +# which chunk name is detected by this cat-item +section-def-attrs = + element section-def-attrs { attlist.section-def-attrs, def-attr+ } +attlist.section-def-attrs &= empty +# The 'def-attrs' section defines the attributes that will be +# identified in matched lexical forms +def-attr = element def-attr { attlist.def-attr, attr-item+ } +attlist.def-attr &= + attribute n { xsd:ID }, + attribute c { text }? +# Each def-attr defines one attribute in terms of a list of +# attribute items and has a mandatory unique name n +attr-item = element attr-item { attlist.attr-item, empty } +attlist.attr-item &= + attribute tags { text }?, + attribute c { text }? +# Each 'attr-item' specifies a subsequence of the tags in +# that lexical form (attribute 'tags') +section-def-vars = + element section-def-vars { attlist.section-def-vars, def-var+ } +attlist.section-def-vars &= empty +# The 'def-vars' section defines the global variables +# that will be used to transfer information between rules +def-var = element def-var { attlist.def-var, empty } +attlist.def-var &= + attribute n { xsd:ID }, + attribute v { text }?, + attribute c { text }? +# The definition of a global variable has a mandatory unique name 'n' that +# will be used to refer to it. A value of initialization can also be specified +# by means of the 'v' attribute.
The default value of the initialization is the +# empty string. +section-def-lists = + element section-def-lists { attlist.section-def-lists, def-list+ } +attlist.section-def-lists &= empty +# Element 'section-def-lists' encloses a set of list definitions +def-list = element def-list { attlist.def-list, list-item+ } +attlist.def-list &= + attribute n { xsd:ID }, + attribute c { text }? +# The 'def-list' element defines a named list to search with the 'in' +# element. Attribute 'n' sets the name of the list +list-item = element list-item { attlist.list-item, empty } +attlist.list-item &= + attribute v { text }, + attribute c { text }? +# Attribute 'v' of 'list-item' element contains the value to be added to +# the list being defined +section-def-macros = + element section-def-macros { attlist.section-def-macros, def-macro+ } +attlist.section-def-macros &= empty +# +# The 'def-macros' section defines macros containing portions of +# code frequently used in the action part of rules +# +def-macro = element def-macro { attlist.def-macro, sentence+ } +attlist.def-macro &= attribute n { xsd:ID } +attlist.def-macro &= + attribute npar { text }, + attribute c { text }? +# Macro definition: +# +# A macro has a mandatory name (the value of 'n'), a number of parameters +# (the value of 'npar') and a body containing arguments and statements. +section-rules = element section-rules { attlist.section-rules, rule+ } +attlist.section-rules &= empty +# The rules section contains a sequence of one or more rules +rule = element rule { attlist.rule, pattern, action } +attlist.rule &= attribute comment { text }? 
+# Each rule has a pattern and an action +# * Attribute 'comment' allows a comment to be included with the rule +pattern = element pattern { attlist.pattern, pattern-item } +attlist.pattern &= empty +# The pattern is specified in terms of pattern items, each one +# representing a lexical form in the matched pattern +pattern-item = element pattern-item { attlist.pattern-item, empty } +attlist.pattern-item &= attribute n { xsd:IDREF } +# Each attribute to be activated is referred to by its name in the def-cats section +action = element action { attlist.action, sentence* } +attlist.action &= attribute c { text }? +# Encloses the procedural part of a rule +choose = element choose { attlist.choose, when+, otherwise? } +attlist.choose &= attribute c { text }? +# The choose statement is a selection statement (similar to a case +# statement) composed of one or more tested cases and an optional +# otherwise +when = element when { attlist.when, test, sentence* } +attlist.when &= attribute c { text }? +# Each tested case is a block of zero or more statements +otherwise = element otherwise { attlist.otherwise, sentence+ } +attlist.otherwise &= attribute c { text }? +# The otherwise case is also a block of one or more statements +test = element test { attlist.test, condition } +attlist.test &= attribute c { text }? +# The test in a tested case may be a conjunction, a disjunction, or +# a negation of simpler tests, as well as a simple equality test +and = element and { attlist.and, condition, condition+ } +attlist.and &= empty +# Each conjunction test contains two or more simpler tests +or = element or { attlist.or, condition, condition+ } +attlist.or &= empty +# Each disjunction test contains two or more simpler tests +not = element not { attlist.not, condition } +attlist.not &= empty +# The negation of a simpler test is a test itself +equal = element equal { attlist.equal, value, value } +attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the +# left part of the equality may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with = element begins-with { attlist.begins-with, value, value } +attlist.begins-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with = element ends-with { attlist.ends-with, value, value } +attlist.ends-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with-list = + element begins-with-list { attlist.begins-with-list, value, \list } +attlist.begins-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. 
+ends-with-list = + element ends-with-list { attlist.ends-with-list, value, \list } +attlist.ends-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +contains-substring = + element contains-substring { + attlist.contains-substring, value, value + } +attlist.contains-substring &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +in = element in { attlist.in, value, \list } +attlist.in &= attribute caseless { "no" | "yes" }? +# 'in' performs a search of a value in a list. 
If 'caseless' is set to yes, +# this search is performed without attending to the case +\list = element list { attlist.list, empty } +attlist.list &= attribute n { xsd:IDREF } +# 'list' refers, with the name in attribute 'n', to a list defined before in +# the 'section-def-lists' section +let = element let { attlist.let, container, value } +attlist.let &= empty +# An assignment statement ('let') assigns the value of a clip (see +# below), a literal string ('lit'), a literal tag ('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip +append = element append { attlist.append, value+ } +attlist.append &= attribute n { xsd:IDREF } +# This instruction appends the value of a clip (see +# below), a literal string ('lit'), a literal tag ('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip, identified by the "n" attribute +out = element out { attlist.out, (b | lu | mlu | var)+ } +attlist.out &= attribute c { text }? +# 'out' is an output statement; it may output blanks, lexical units, multiwords or variables +modify-case = + element modify-case { attlist.modify-case, container, stringvalue } +attlist.modify-case &= empty +# The first argument of 'modify-case' copies the case of the second +# argument. +call-macro = element call-macro { attlist.call-macro, with-param* } +attlist.call-macro &= attribute n { xsd:IDREF } +# A macro may be called anywhere by name with zero or more +# arguments +with-param = element with-param { attlist.with-param, empty } +attlist.with-param &= attribute pos { text } +# The attribute pos in each argument is used to refer to a lexical +# form in the current rule.
For example, if a 2-parameter macro +# has been defined to perform noun-adjective agreement operations, +# it may be used with arguments 1 and 2 in a noun-adjective rule, +# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with +# arguments 1 and 3 in a noun-adverb-adjective rule, and with +# arguments 2 and 1 in an adjective-noun rule +clip = element clip { attlist.clip, empty } +attlist.clip &= + attribute pos { text }, + attribute part { text }, + attribute c { text }? +# A 'clip' is a substring of a source-language or target-language +# lexical form, extracted according to an attribute: +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +# +lit = element lit { attlist.lit, empty } +attlist.lit &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +lit-tag = element lit-tag { attlist.lit-tag, empty } +attlist.lit-tag &= attribute v { text } +# A literal tag value: the value of the literal is the value of +# the 'v' attribute +var = element var { attlist.var, empty } +attlist.var &= attribute n { xsd:IDREF } +# Each 'var' is a variable identifier: the attribute n is the name +# of the variable. When it is in an 'out', a 'test', or the right +# part of a 'let', it represents the value of the variable; when in +# the left part of a 'let' it represents the reference of the +# variable.
+get-case-from = + element get-case-from { attlist.get-case-from, (clip | lit | var) } +attlist.get-case-from &= attribute pos { text } +# Warning: all the comments that mention get-case-from still need +# to be updated +case-of = element case-of { attlist.case-of, empty } +attlist.case-of &= + attribute pos { text }, + attribute part { text } +# A 'case-of' is a value representing the case of a "clip". This value +# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA" +# (all uppercase). +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +concat = element concat { attlist.concat, value+ } +attlist.concat &= empty +# Concatenates a sequence of values +mlu = element mlu { attlist.mlu, lu+ } +attlist.mlu &= empty +# Encloses a multiword +lu = element lu { attlist.lu, value+ } +attlist.lu &= empty +# Encloses a word +b = element b { attlist.b, empty } +attlist.b &= attribute pos { text }? +# 'b' is a [super]blanks item, indexed by pos; for example, a 'b' +# with pos="2" refers to the [super]blanks (including format data +# encapsulated by the de-formatter) between lexical form 2 and +# lexical form 3. Managing [super]blanks explicitly allows for the +# correct placement of format when the result of structural +# transfer has more or less lexical items than the original or has +# been reordered in some way. If attribute "pos" is not specified, then +# a single blank (ASCII 32) is generated.
+lu-count = element lu-count { attlist.lu-count, empty } +attlist.lu-count &= empty +start = postchunk +# Number of lexical units (words inside the chunk) in the rule Index: branches/apertium-tagger/apertium2/apertium/tagger.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger.rnc (revision 69632) @@ -0,0 +1,122 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# DTD for the tagset and the rules to enforce the state to state +# transition probabilities used by the part-of-speech tagger. +# 2005.07.29. + +tagger = + element tagger { + attlist.tagger, + tagset, + forbid?, + enforce-rules?, + preferences?, + discard-on-ambiguity? 
+ } +attlist.tagger &= attribute name { text } +# 'tagger' is the root element containing the whole tagset for a given +# language specified through the mandatory attribute 'name' +tagset = element tagset { attlist.tagset, def-label+, def-mult* } +attlist.tagset &= empty +# The 'tagset' section defines the correspondence between simple +# or multiple morphological categories defining a lexical form and the coarser +# ones with which the part-of-speech tagger works +def-label = element def-label { attlist.def-label, tags-item+ } +attlist.def-label &= + attribute name { text }, + attribute c { text }?, + attribute closed { text }? +# Each 'def-label' defines one coarse tag in terms of a list of fine tags +# and has a mandatory unique name. The optional attribute 'closed="true"' may be used +# to specify if the defined fine tags belong to a closed list. +# c is for comments and is ignored +tags-item = element tags-item { attlist.tags-item, empty } +attlist.tags-item &= + attribute tags { text }, + attribute lemma { text }? +# Each 'tags-item' may be a dot-separated subsequence of the morphological tags +# corresponding to a coarse tag optionally in association with a given lemma +def-mult = element def-mult { attlist.def-mult, sequence+ } +attlist.def-mult &= + attribute name { text }, + attribute c { text }?, + attribute closed { text }? +# Each 'def-mult' defines one coarse tag in terms of a sequence of coarse +# tags previously defined as 'def-labels' or a sequence of fine tags.
A mandatory +# name is required for each 'def-mult' which may also have an optional attribute +# 'closed="true"' if it belongs to a closed list +# c is for comments and is ignored +sequence = + element sequence { attlist.sequence, (tags-item | label-item)+ } +attlist.sequence &= empty +# Element 'sequence' encloses a set of tags or labels which defines +# a unit with more than one label +label-item = element label-item { attlist.label-item, empty } +attlist.label-item &= + attribute label { text }, + attribute c { text }? +# Each 'label' of the 'label-item' corresponds to a coarse tag previously +# defined as a 'def-label' by a name. +# c is for comments and is ignored +forbid = element forbid { attlist.forbid, label-sequence+ } +attlist.forbid &= empty +# Element 'forbid' contains sequences of morphological categories that are not +# allowed in a given language +label-sequence = + element label-sequence { attlist.label-sequence, label-item+ } +attlist.label-sequence &= attribute c { text }? +# Each 'label-sequence' is restricted to two 'label-items' (not enforced by this grammar) +# c is for comments and is ignored +enforce-rules = + element enforce-rules { attlist.enforce-rules, enforce-after+ } +attlist.enforce-rules &= empty +# Element 'enforce-rules' defines sets of coarse tags that must follow specified ones +enforce-after = + element enforce-after { attlist.enforce-after, label-set } +attlist.enforce-after &= + attribute label { text }, + attribute c { text }? +# Each 'enforce-after' encloses the set of coarse tags ('label-set') that must follow +# the one defined in 'label', as a mandatory attribute +# c is for comments and is ignored +label-set = element label-set { attlist.label-set, label-item+ } +attlist.label-set &= attribute c { text }?
+# The set of 'label-items' enforced after a 'label' are enclosed inside element 'label-set' +# c is for comments and is ignored +preferences = element preferences { attlist.preferences, prefer+ } +attlist.preferences &= empty +# Element 'preferences' allows to decide amongst two or more fine tag sequences +# which are grouped in the same coarse tag. +prefer = element prefer { attlist.prefer, empty } +attlist.prefer &= + attribute tags { text }, + attribute c { text }? +# Each 'prefer' element has a mandatory attribute 'tags' made of a sequence of fine tags +# c is for comments and is ignored +discard-on-ambiguity = + element discard-on-ambiguity { + attlist.discard-on-ambiguity, discard+ + } +attlist.discard-on-ambiguity &= empty +# List of label-item or tags-item to be discarded when an ambiguity +# occurs inside a word +discard = element discard { attlist.discard, empty } +attlist.discard &= + attribute tags { text }, + attribute c { text }? +start = tagger +# Each 'discard' element has a mandatory attribute 'tags' made of a sequence of fine tags +# c is for comments and is ignored Index: branches/apertium-tagger/apertium2/apertium/transfer.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.rnc (revision 69632) @@ -0,0 +1,407 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# Draft of DTD for the structural transfer rule files +# +# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada, +# 2005.07.29. + +condition = + and + | or + | not + | equal + | begins-with + | begins-with-list + | ends-with + | ends-with-list + | contains-substring + | in +container = var | clip +sentence = + let + | out + | choose + | modify-case + | call-macro + | append + | reject-current-rule +value = + b + | clip + | lit + | lit-tag + | var + | get-case-from + | case-of + | concat + | lu + | mlu + | chunk +stringvalue = clip | lit | var | get-case-from | case-of +transfer = + element transfer { + attlist.transfer, + section-def-cats, + section-def-attrs?, + section-def-vars?, + section-def-lists?, + section-def-macros?, + section-rules + } +attlist.transfer &= attribute default { "lu" | "chunk" }? +# 'transfer' is the root element containing the whole structural +# transfer rule file. Attribute 'default' specifies if +# unmatched words have to be written as lexical units ("lu", this is +# the default value) or as chunks ("chunk"). +section-def-cats = + element section-def-cats { attlist.section-def-cats, def-cat+ } +attlist.section-def-cats &= empty +# The 'def-cats' section defines the categories used to build the +# patterns used in rules +def-cat = element def-cat { attlist.def-cat, cat-item+ } +attlist.def-cat &= + attribute n { xsd:ID }, + attribute c { text }? +# Each 'def-cat' defines one category in terms of a list of +# category items and has a unique name 'n', which is mandatory +cat-item = element cat-item { attlist.cat-item, empty } +attlist.cat-item &= + attribute lemma { text }?, + attribute tags { text }, + attribute c { text }? 
+# Each 'cat-item' (category item) represents a set of lexical forms +# and has a mandatory attribute 'tags' whose value is a sequence of +# dot-separated tag names; this sequence is a subsequence of the +# tag sequence defining each possible lexical form. For example, +# tags="n.f" would match all lexical forms containing this tag +# sequence, such as "^casa<n><f><pl>$". +# +# In addition, an optional attribute, "lemma", may be used to +# define lexical forms having a particular substring in their lemma +section-def-attrs = + element section-def-attrs { attlist.section-def-attrs, def-attr+ } +attlist.section-def-attrs &= empty +# The 'def-attrs' section defines the attributes that will be +# identified in matched lexical forms +def-attr = element def-attr { attlist.def-attr, attr-item+ } +attlist.def-attr &= + attribute n { xsd:ID }, + attribute c { text }? +# Each def-attr defines one attribute in terms of a list of +# attribute items and has a mandatory unique name n +attr-item = element attr-item { attlist.attr-item, empty } +attlist.attr-item &= + attribute tags { text }?, + attribute c { text }? +# Each 'attr-item' specifies a subsequence of the tags in +# that lexical form (attribute 'tags') +section-def-vars = + element section-def-vars { attlist.section-def-vars, def-var+ } +attlist.section-def-vars &= empty +# The 'def-vars' section defines the global variables +# that will be used to transfer information between rules +def-var = element def-var { attlist.def-var, empty } +attlist.def-var &= + attribute n { xsd:ID }, + attribute v { text }?, + attribute c { text }? +# The definition of a global variable has a mandatory unique name 'n' that +# will be used to refer to it. A value of initialization can also be specified +# by means of the 'v' attribute. The default value of the initialization is the +# empty string.
+section-def-lists = + element section-def-lists { attlist.section-def-lists, def-list+ } +attlist.section-def-lists &= empty +# Element 'section-def-lists' encloses a set of list definitions +def-list = element def-list { attlist.def-list, list-item+ } +attlist.def-list &= + attribute n { xsd:ID }, + attribute c { text }? +# The 'def-list' element defines a named list to search with the 'in' +# element. Attribute 'n' sets the name of the list +list-item = element list-item { attlist.list-item, empty } +attlist.list-item &= + attribute v { text }, + attribute c { text }? +# Attribute 'v' of 'list-item' element contains the value to be added to +# the list being defined +section-def-macros = + element section-def-macros { attlist.section-def-macros, def-macro+ } +attlist.section-def-macros &= empty +# +# The 'def-macros' section defines macros containing portions of +# code frequently used in the action part of rules +# +def-macro = element def-macro { attlist.def-macro, sentence+ } +attlist.def-macro &= attribute n { xsd:ID } +attlist.def-macro &= + attribute npar { text }, + attribute c { text }? +# Macro definition: +# +# A macro has a mandatory name (the value of 'n'), a number of parameters +# (the value of 'npar') and a body containing arguments and statements. +section-rules = element section-rules { attlist.section-rules, rule+ } +attlist.section-rules &= empty +# The rules section contains a sequence of one or more rules +rule = element rule { attlist.rule, pattern, action } +attlist.rule &= attribute comment { text }? 
+# Each rule has a pattern and an action +# * attribute 'comment' allows comments to be added about the purpose of +# the rule being defined +pattern = element pattern { attlist.pattern, pattern-item+ } +attlist.pattern &= empty +# The pattern is specified in terms of pattern items, each one +# representing a lexical form in the matched pattern +pattern-item = element pattern-item { attlist.pattern-item, empty } +attlist.pattern-item &= attribute n { xsd:IDREF } +# Each attribute to be activated is referred to by its name in the def-cats section +action = element action { attlist.action, sentence* } +attlist.action &= attribute c { text }? +# Encloses the procedural part of a rule +choose = element choose { attlist.choose, when+, otherwise? } +attlist.choose &= attribute c { text }? +# The choose statement is a selection statement (similar to a case +# statement) composed of one or more tested cases and an optional +# otherwise +when = element when { attlist.when, test, sentence* } +attlist.when &= attribute c { text }? +# Each tested case is a block of zero or more statements +otherwise = element otherwise { attlist.otherwise, sentence+ } +attlist.otherwise &= attribute c { text }? +# The otherwise case is also a block of one or more statements +test = element test { attlist.test, condition } +attlist.test &= attribute c { text }? +# The test in a tested case may be a conjunction, a disjunction, or +# a negation of simpler tests, as well as a simple equality test +and = element and { attlist.and, condition, condition+ } +attlist.and &= empty +# Each conjunction test contains two or more simpler tests +or = element or { attlist.or, condition, condition+ } +attlist.or &= empty +# Each disjunction test contains two or more simpler tests +not = element not { attlist.not, condition } +attlist.not &= empty +# The negation of a simpler test is a test itself +equal = element equal { attlist.equal, value, value } +attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the +# left part of the equality may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with = element begins-with { attlist.begins-with, value, value } +attlist.begins-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with = element ends-with { attlist.ends-with, value, value } +attlist.ends-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with-list = + element begins-with-list { attlist.begins-with-list, value, \list } +attlist.begins-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. 
+ends-with-list = + element ends-with-list { attlist.ends-with-list, value, \list } +attlist.ends-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +contains-substring = + element contains-substring { + attlist.contains-substring, value, value + } +attlist.contains-substring &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +in = element in { attlist.in, value, \list } +attlist.in &= attribute caseless { "no" | "yes" }? +# 'in' performs a search of a value in a list. 
If 'caseless' is set to yes, +# this search is performed without attending to the case +\list = element list { attlist.list, empty } +attlist.list &= attribute n { xsd:IDREF } +# 'list' refers, by the name in attribute 'n', to a list defined earlier in +# the 'section-def-list' section +let = element let { attlist.let, container, value } +attlist.let &= empty +# An assignment statement ('let') assigns the value of a clip (see +# below), a literal string ('lit'), a literal tag ('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip +append = element append { attlist.append, value+ } +attlist.append &= attribute n { xsd:IDREF } +# This instruction appends the value of a clip (see +# below), a literal string ('lit'), a literal tag ('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip, identified by the "n" attribute +out = element out { attlist.out, (mlu | lu | b | chunk | var)+ } +attlist.out &= attribute c { text }? +# 'out' is an output statement; it may output any sequence of +# clips, literal strings, literal tags, variables, and whitespace items +# (see below) +modify-case = + element modify-case { attlist.modify-case, container, stringvalue } +attlist.modify-case &= empty +# The first argument of 'modify-case' copies the case of the second +# argument. +call-macro = element call-macro { attlist.call-macro, with-param* } +attlist.call-macro &= attribute n { xsd:IDREF } +# A macro may be called anywhere by name with one or more +# arguments +with-param = element with-param { attlist.with-param, empty } +attlist.with-param &= attribute pos { text } +# The attribute pos in each argument is used to refer to a lexical +# form in the current rule. 
For example, if a 2-parameter macro +# has been defined to perform noun-adjective agreement operations, +# it may be used with arguments 1 and 2 in a noun-adjective rule, +# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with +# arguments 1 and 3 in a noun-adverb-adjective rule, and with +# arguments 2 and 1 in an adjective-noun rule +clip = element clip { attlist.clip, empty } +attlist.clip &= + attribute pos { text }, + attribute side { "sl" | "tl" }, + attribute part { text }, + attribute queue { text }?, + attribute link-to { text }?, + attribute c { text }? +# A 'clip' is a substring of a source-language or target-language +# lexical form, extracted according to an attribute: +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * 'side' is used to select a source-language ('sl') or a +# target-language ('tl') clip +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +# +# * the value of 'queue' may be 'no' or 'yes'. 'yes' is assumed by +# default. +# +# * 'link-to' causes the other attributes to be ignored in clip evaluation +# when using 'clip' as a right hand side element (as value), and +# returns its value. When using as a left hand side (as reference), +# the value of the 'as' attribute is ignored. 
+lit = element lit { attlist.lit, empty } +attlist.lit &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +lit-tag = element lit-tag { attlist.lit-tag, empty } +attlist.lit-tag &= attribute v { text } +# A literal tag value: the value of the tag is the value of +# the 'v' attribute +var = element var { attlist.var, empty } +attlist.var &= attribute n { xsd:IDREF } +# Each 'var' is a variable identifier: the attribute n is the name +# of the variable. When it is in an 'out', a 'test', or the right +# part of a 'let', it represents the value of the variable; when in +# the left part of a 'let' it represents the reference of the +# variable. +get-case-from = + element get-case-from { attlist.get-case-from, (clip | lit | var) } +attlist.get-case-from &= attribute pos { text } +# Warning: all the comments that involve get-case-from still need +# to be updated +case-of = element case-of { attlist.case-of, empty } +attlist.case-of &= + attribute pos { text }, + attribute side { "sl" | "tl" }, + attribute part { text } +# A 'case-of' is a value representing the case of a "clip". This value +# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA" +# (all uppercase). +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * 'side' is used to select a source-language ('sl') or a +# target-language ('tl') clip +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). 
+concat = element concat { attlist.concat, value+ } +attlist.concat &= empty +# Concatenates a sequence of values +mlu = element mlu { attlist.mlu, lu+ } +attlist.mlu &= empty +# Encloses a multiword +lu = element lu { attlist.lu, value+ } +attlist.lu &= empty +# Encloses a word inside an 'out' element. +reject-current-rule = + element reject-current-rule { attlist.reject-current-rule, empty } +attlist.reject-current-rule &= attribute shifting { "yes" | "no" }? +# This instruction cancels the execution of the rule being processed. +# If "shifting" is set to "yes" or is not specified, the matching process +# consumes exactly one word at the input. If "shifting" is set to "no" +# then the rule is marked so that it is not considered in the current matching +# until the input buffer advances by at least one word +chunk = element chunk { attlist.chunk, tags, (mlu | lu | b | var)+ } +attlist.chunk &= + attribute name { text }?, + attribute namefrom { text }?, + attribute case { text }?, + attribute c { text }? +# Encloses a chunk inside an 'out' element. +# * 'name' the pseudolemma of the chunk. +# * 'namefrom' get the name from a variable. +# * 'case' the variable to get the uppercase/lowercase policy +# to apply it to the chunk name +tags = element tags { attlist.tags, tag+ } +attlist.tags &= empty +tag = element tag { attlist.tag, value } +attlist.tag &= empty +b = element b { attlist.b, empty } +attlist.b &= attribute pos { text }? +start = transfer +# 'b' is a [super]blanks item, indexed by pos; for example, a 'b' +# with pos="2" refers to the [super]blanks (including format data +# encapsulated by the de-formatter) between lexical form 2 and +# lexical form 3. Managing [super]blanks explicitly allows for the +# correct placement of format when the result of structural +# transfer has more or fewer lexical items than the original or has +# been reordered in some way. If attribute "pos" is not specified, then +# a single blank (ASCII 32) is generated. 
Index: branches/apertium-tagger/apertium2/apertium/transfer.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.rng (revision 69632) @@ -0,0 +1,1104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lu + chunk + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sl + tl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sl + tl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + no + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/apertium-transfer.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-transfer.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-transfer.1 (revision 69632) @@ -0,0 +1,80 @@ +.TH apertium-transfer 1 2006-03-08 "" "" +.SH NAME +apertium-transfer \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-transfer +[\-n] trules preproc biltrans [input [output]] +.PP +.B apertium-transfer +trules preproc [input [output]] +.PP +.B apertium-transfer +\-x extended trules preproc biltrans [input [output]] +.PP +.B apertium-transfer +\-c trules preproc biltrans [input [output]] +.PP +.B apertium-transfer +\-t trules preproc biltrans [input [output]] +.SH DESCRIPTION +.BR apertium-transfer +is the program that performs the transfer from input language +into output language. Normally this program will not be used independently, but in combination with other programs: +.PP +.RE +.SH FILES +These are the five files that can be used with this command: +.PP +.B trules +Transfer rules file +.PP +.B preproc +Result of preprocess trules file +.PP +.B biltrans +Bilingual letter transducer file +.PP +.B infile +Input file (stdin by default). +.PP +.B outfile +Output file (stdout by default). +.PP +\-.B \-b +\-input from lexical transfer (single level transfer only) +\-.PP +\-.B \-h +\-shows this message +\-.PP +.B -n +Do not use a bilingual dictionary to process the input. 
+.PP +.B -x bindix +extended mode with user dictionary +.PP +.B -c +case-sensitiveness while accessing bilingual dictionary +.PP +.B -t +trace mode: show rule numbers and matched content +.PP +.B -T +extended trace mode, for use with apertium-transfer-tools +.PP +.B -z +null-flushing output on +.PP +.SH SEE ALSO +.I apertium \fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/transfer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.cc (revision 69632) @@ -0,0 +1,2346 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +void +Transfer::destroy() +{ + if(me) + { + delete me; + me = NULL; + } + if(doc) + { + xmlFreeDoc(doc); + doc = NULL; + } +} + +Transfer::Transfer() : +word(0), +blank(0), +lword(0), +lblank(0), +output(0), +any_char(0), +any_tag(0), +nwords(0) +{ + me = NULL; + doc = NULL; + root_element = NULL; + lastrule = NULL; + defaultAttrs = lu; + useBilingual = true; + preBilingual = false; + isExtended = false; + null_flush = false; + internal_null_flush = false; + trace = false; + trace_att = false; + emptyblank = ""; +} + +Transfer::~Transfer() +{ + destroy(); +} + +void +Transfer::readData(FILE *in) +{ + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + attr_items[cad_k].read(in); + wstring fallback = Compression::wstring_read(in); + if(recompile_attrs) { + attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + } + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = 
UtfConverter::toUtf8(Compression::wstring_read(in)); + macros[cad_k] = Compression::multibyte_read(in); + } + + // lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +Transfer::readBil(string const &fstfile) +{ + FILE *in = fopen(fstfile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << fstfile << "'." << endl; + exit(EXIT_FAILURE); + } + fstp.load(in); + fstp.initBiltrans(); + fclose(in); +} + +void +Transfer::setExtendedDictionary(string const &fstfile) +{ + FILE *in = fopen(fstfile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open extended dictionary file '" << fstfile << "'." << endl; + exit(EXIT_FAILURE); + } + extended.load(in); + extended.initBiltrans(); + fclose(in); + isExtended = true; +} + +void +Transfer::read(string const &transferfile, string const &datafile, + string const &fstfile) +{ + readTransfer(transferfile); + + // datafile + FILE *in = fopen(datafile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + + if(fstfile != "") + { + readBil(fstfile); + } +} + +void +Transfer::readTransfer(string const &in) +{ + doc = xmlReadFile(in.c_str(), NULL, 0); + + if(doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." 
<< endl; + exit(EXIT_FAILURE); + } + + root_element = xmlDocGetRootElement(doc); + + // search for root element attributes + for(xmlAttr *i = root_element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "default")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "chunk")) + { + defaultAttrs = chunk; + } + else + { + defaultAttrs = lu; // default value for 'default' + } + } + } + + // search for macros & rules + for(xmlNode *i = root_element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) + { + collectMacros(i); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) + { + collectRules(i); + } + } + } +} + +void +Transfer::collectRules(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + for(xmlNode *j = i->children; ; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) + { + rule_map.push_back(j); + break; + } + } + } + } +} + +void +Transfer::collectMacros(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + macro_map.push_back(i); + } + } +} + +bool +Transfer::checkIndex(xmlNode *element, int index, int limit) +{ + if(index >= limit) + { + wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) <line << endl; + return false; + } + return true; +} + + +string +Transfer::evalString(xmlNode *element) +{ + map::iterator it; + it = evalStringCache.find(element); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_clip_sl: + if(checkIndex(element, ti.getPos(), lword)) + { + return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); + } + break; + + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return 
word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()); + } + break; + + case ti_linkto_sl: + if(checkIndex(element, ti.getPos(), lword)) + { + if(word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()) != "") + { + return "<" + string((char *) ti.getPointer()) + ">"; + } + else + { + return ""; + } + } + break; + + case ti_linkto_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + if(word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()) != "") + { + return "<" + string((char *) ti.getPointer()) + ">"; + } + else + { + return ""; + } + } + break; + + case ti_var: + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(checkIndex(element, ti.getPos(), lblank)) + { + if(ti.getPos() >= 0) + { + return !blank?"":*(blank[ti.getPos()]); + } + return " "; + } + break; + + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) + { + return copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); + } + break; + + case ti_case_of_sl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->source(attr_items[ti.getContent()])); + } + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->target(attr_items[ti.getContent()])); + } + break; + + default: + return ""; + } + return ""; + } + + if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL, *as = NULL; + bool queue = true; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *)i->children->content) - 1; + } + 
else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = i->children->content; + } + } + + if(as != NULL) + { + if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + evalStringCache[element] = TransferInstr(ti_linkto_sl, (const char *) part, pos, (void *) as, queue); + } + else + { + evalStringCache[element] = TransferInstr(ti_linkto_tl, (const char *) part, pos, (void *) as, queue); + } + } + else if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + evalStringCache[element] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); + } + else + { + evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) + { + evalStringCache[element] = TransferInstr(ti_lit_tag, + tags((const char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) + { + evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) + { + if(element->properties == NULL) + { + evalStringCache[element] = TransferInstr(ti_b, " ", -1); + } + else + { + int pos = atoi((const char *) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, "", pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) + { + int pos = atoi((const char *) element->properties->children->content) - 1; + xmlNode *param = NULL; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + param = i; + break; + } + } + + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) + { + 
evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + } + + if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + evalStringCache[element] = TransferInstr(ti_case_of_sl, (const char *) part, pos); + } + else + { + evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) + { + string value; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + value.append(evalString(i)); + } + } + return value; + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + myword.append(evalString(i)); + } + } + + if(myword != "") + { + return "^"+myword+"$"; + } + else + { + return ""; + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) + { + string value; + + bool first_time = true; + + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + string myword; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + + if(!first_time) + { + if(myword != "" && myword[0] != '#') //'+#' problem + { + value.append("+"); + } + } + else + { + if(myword != "") + { + first_time = false; + } + } + + value.append(myword); + } 
+ } + + if(value != "") + { + return "^"+value+"$"; + } + else + { + return ""; + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) + { + return processChunk(element); + } + else + { + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + + return evalString(element); +} + +void +Transfer::processOut(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(defaultAttrs == lu) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + if(myword != "") + { + fputwc_unlocked(L'^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + fputwc_unlocked(L'$', output); + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) + { + fputwc_unlocked('^', output); + bool first_time = true; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + string myword; + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + myword.append(evalString(k)); + } + } + + if(!first_time) + { + if(myword != "" && myword[0] != '#') //'+#' problem + { + fputwc_unlocked(L'+', output); + } + } + else + { + if(myword != "") + { + first_time = false; + } + } + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + } + } + fputwc_unlocked(L'$', output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), + output); + } + } + else + { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) + { + fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + } + } + } + } +} + +string +Transfer::processChunk(xmlNode *localroot) +{ + 
string name, namefrom; + string caseofchunk = "aa"; + string result; + + + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "name")) + { + name = (const char *) i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "namefrom")) + { + namefrom = (const char *) i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "case")) + { + caseofchunk = (const char *) i->children->content; + } + } + + result.append("^"); + if(caseofchunk != "") + { + if(name != "") + { + result.append(copycase(variables[caseofchunk], name)); + } + else if(namefrom != "") + { + result.append(copycase(variables[caseofchunk], variables[namefrom])); + } + else + { + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + exit(EXIT_FAILURE); + } + } + else + { + if(name != "") + { + result.append(name); + } + else if(namefrom != "") + { + result.append(variables[namefrom]); + } + else + { + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + exit(EXIT_FAILURE); + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) + { + result.append(processTags(i)); + result.append("{"); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + if(myword != "") + { + result.append("^"); + result.append(myword); + result.append("$"); + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) + { + bool first_time = true; + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + string mylocalword; + if(j->type == XML_ELEMENT_NODE) + { + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + 
mylocalword.append(evalString(k)); + } + } + + if(!first_time) + { + if(mylocalword != "" && mylocalword[0] != '#') // '+#' problem + { + myword += '+'; + } + } + else + { + first_time = false; + } + } + myword.append(mylocalword); + } + if(myword != "") + { + result.append("^"); + result.append(myword); + result.append("$"); + } + } + else // 'b' + { + result.append(evalString(i)); + } + } + } + result.append("}$"); + return result; +} + +string +Transfer::processTags(xmlNode *localroot) +{ + string result; + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + result.append(evalString(j)); + } + } + } + } + } + return result; +} + +int +Transfer::processInstruction(xmlNode *localroot) +{ + int words_to_consume = -1; + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + words_to_consume = processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule")) + { + words_to_consume = processRejectCurrentRule(localroot); + } + return words_to_consume; +} + +int +Transfer::processRejectCurrentRule(xmlNode *localroot) +{ + bool shifting = true; + string value; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "shifting")) + { + value = (char 
*) i->children->content; + break; + } + } + + if(value == "no") + { + shifting = false; + } + + return shifting ? 1 : 0; +} + +void +Transfer::processLet(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + map::iterator it = evalStringCache.find(leftSide); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_var: + variables[ti.getContent()] = evalString(rightSide); + return; + + case ti_clip_sl: + word[ti.getPos()]->setSource(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + return; + + case ti_clip_tl: + word[ti.getPos()]->setTarget(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + return; + + default: + return; + } + } + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = evalString(rightSide); + evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL, *as = NULL; + bool queue = true; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = 
i->children->content; + } + } + + if(!xmlStrcmp(side, (const xmlChar *) "tl")) + { + word[pos]->setTarget(attr_items[(const char *) part], evalString(rightSide), queue); + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); + } + else + { + word[pos]->setSource(attr_items[(const char *) part], evalString(rightSide), queue); + evalStringCache[leftSide] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); + } + } +} + +void +Transfer::processAppend(xmlNode *localroot) +{ + string name; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "n")) + { + name = (char *) i->children->content; + break; + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + variables[name].append(evalString(i)); + } + } +} + +void +Transfer::processModifyCase(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL, *as = NULL; + bool queue = true; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (xmlChar const *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = i->children->content; + 
(void)as; // ToDo, remove "as" and the whole else? + } + } + if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + string const result = copycase(evalString(rightSide), + word[pos]->source(attr_items[(const char *) part], queue)); + word[pos]->setSource(attr_items[(const char *) part], result); + } + else + { + string const result = copycase(evalString(rightSide), + word[pos]->target(attr_items[(const char *) part], queue)); + word[pos]->setTarget(attr_items[(const char *) part], result); + } + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = copycase(evalString(rightSide), variables[val]); + } +} + +void +Transfer::processCallMacro(xmlNode *localroot) +{ + string const n = (const char *) localroot->properties->children->content; + int npar = 0; + + xmlNode *macro = macro_map[macros[n]]; + + for(xmlAttr *i = macro->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) + { + npar = atoi((const char *) i->children->content); + break; + } + } + + // ToDo: Is it at all valid if npar <= 0 ? 
+ + TransferWord **myword = NULL; + if(npar > 0) + { + myword = new TransferWord *[npar]; + } + string **myblank = NULL; + if(npar > 0) + { + myblank = new string *[npar]; + myblank[npar-1] = &emptyblank; + } + + int idx = 0; + int lastpos = 0; + for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + int pos = atoi((const char *) i->properties->children->content)-1; + myword[idx] = word[pos]; + if(idx-1 >= 0) + { + myblank[idx-1] = blank[lastpos]; + } + idx++; + lastpos = pos; + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + for(xmlNode *i = macro->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + delete[] myword; + delete[] myblank; +} + +int +Transfer::processChoose(xmlNode *localroot) +{ + int words_to_consume = -1; + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "when")) + { + bool picked_option = false; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "test")) + { + if(!processTest(j)) + { + break; + } + else + { + picked_option = true; + } + } + else + { + words_to_consume = processInstruction(j); + if(words_to_consume != -1) + { + return words_to_consume; + } + } + } + } + if(picked_option) + { + return words_to_consume; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + words_to_consume = processInstruction(j); + if(words_to_consume != -1) + { + return words_to_consume; + } + } + } + } + } + } + return words_to_consume; +} + +bool +Transfer::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar 
*) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +Transfer::processIn(xmlNode *localroot) +{ + xmlNode *value = NULL; + xmlChar *idlist = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(value == NULL) + { + value = i; + } + else + { + idlist = i->properties->children->content; + break; + } + } + } + + string sval = evalString(value); + + if(localroot->properties != NULL) + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + set &myset = listslow[(const char *) idlist]; + if(myset.find(tolower(sval)) != myset.end()) + { + return true; + } + else + { + return false; + } + } + } + + set &myset = lists[(const char *) idlist]; + if(myset.find(sval) != myset.end()) + { + return true; + } + else + { + return false; + } +} + +bool +Transfer::processTest(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i 
!= NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return processLogical(i); + } + } + return false; +} + +bool +Transfer::processAnd(xmlNode *localroot) +{ + bool val = true; + for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val && processLogical(i); + } + } + + return val; +} + +bool +Transfer::processOr(xmlNode *localroot) +{ + bool val = false; + for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val || processLogical(i); + } + } + + return val; +} + +bool +Transfer::processNot(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return !processLogical(i); + } + } + return false; +} + +bool +Transfer::processEqual(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first) == evalString(second); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)) == tolower(evalString(second)); + } + else + { + return evalString(first) == evalString(second); + } + } +} + +bool +Transfer::beginsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = 0; i != limit; i++) + { + if(s1[i] != s2[i]) + { + return false; + } + } + + return true; +} + +bool +Transfer::endsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) + { + 
if(s1[j] != s2[i]) + { + return false; + } + } + + return true; +} + + +bool +Transfer::processBeginsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return beginsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return beginsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return beginsWith(evalString(first), evalString(second)); + } + } +} + +bool +Transfer::processEndsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return endsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return endsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return endsWith(evalString(first), evalString(second)); + } + } +} + +bool +Transfer::processBeginsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = 
lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(beginsWith(needle, *it)) + { + return true; + } + } + return false; +} + + +bool +Transfer::processEndsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(endsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Transfer::processContainsSubstring(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first).find(evalString(second)) != string::npos; + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + } + else + { + return evalString(first).find(evalString(second)) != string::npos; + } + } +} + +string +Transfer::copycase(string const &source_word, string const &target_word) +{ + wstring result; + wstring 
const s_word = UtfConverter::fromUtf8(source_word); + wstring const t_word = UtfConverter::fromUtf8(target_word); + + bool firstupper = iswupper(s_word[0]); + bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); + bool sizeone = s_word.size() == 1; + + if(!uppercase || (sizeone && uppercase)) + { + result = t_word; + result[0] = towlower(result[0]); + //result = StringUtils::tolower(t_word); + } + else + { + result = StringUtils::toupper(t_word); + } + + if(firstupper) + { + result[0] = towupper(result[0]); + } + + return UtfConverter::toUtf8(result); +} + +string +Transfer::caseOf(string const &str) +{ + wstring const s = UtfConverter::fromUtf8(str); + + if(s.size() > 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else if(!iswupper(s[s.size()-1])) + { + return "Aa"; + } + else + { + return "AA"; + } + } + else if(s.size() == 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else + { + return "Aa"; + } + } + else + { + return "aa"; + } +} + +string +Transfer::tolower(string const &str) const +{ + return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); +} + +string +Transfer::tags(string const &str) const +{ + string result = "<"; + + for(unsigned int i = 0, limit = str.size(); i != limit; i++) + { + if(str[i] == '.') + { + result.append("><"); + } + else + { + result += str[i]; + } + } + + result += '>'; + + return result; +} + +int +Transfer::processRule(xmlNode *localroot) +{ + int instruction_return, words_to_consume = -1; + // localroot is suposed to be an 'action' tag + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + instruction_return = processInstruction(i); + // When an instruction which modifies the number of words to be consumed + // from the input is found, execution of the rule is stopped + if(instruction_return != -1) + { + words_to_consume = instruction_return; + break; + } + } + } + return words_to_consume; +} + +TransferToken & 
+Transfer::readToken(FILE *in) +{ + if(!input_buffer.isEmpty()) + { + return input_buffer.next(); + } + + wstring content; + while(true) + { + int val = fgetwc_unlocked(in); + if(feof(in) || (val == 0 && internal_null_flush)) + { + return input_buffer.add(TransferToken(content, tt_eof)); + } + if(val == '\\') + { + content += L'\\'; + content += (wchar_t) fgetwc_unlocked(in); + } + else if(val == L'[') + { + content += L'['; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } + } + } + else if(val == L'$') + { + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val == L'^') + { + return input_buffer.add(TransferToken(content, tt_blank)); + } + else if(val == L'\0' && null_flush) + { + fflush(output); + } + else + { + content += wchar_t(val); + } + } +} + +bool +Transfer::getNullFlush(void) +{ + return null_flush; +} + +void +Transfer::setNullFlush(bool null_flush) +{ + this->null_flush = null_flush; +} + +void +Transfer::setTrace(bool trace) +{ + this->trace = trace; +} + +void +Transfer::setTraceATT(bool trace) +{ + this->trace_att = trace; +} + +void +Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) +{ + null_flush = false; + internal_null_flush = true; + + while(!feof(in)) + { + transfer(in, out); + fputwc_unlocked(L'\0', out); + int code = fflush(out); + if(code != 0) + { + wcerr << L"Could not flush output " << errno << endl; + } + } + + internal_null_flush = false; + null_flush = true; +} + +void +Transfer::transfer(FILE *in, FILE *out) +{ + if(getNullFlush()) + { + transfer_wrapper_null_flush(in, out); + } + + int last = 0; + int prev_last = 0; + int lastrule_id = -1; + set banned_rules; + + output = out; + ms.init(me->getInitial()); + + while(true) + { + if(trace_att) + { + cerr << "Loop start " << endl; + cerr << "ms.size: " << 
ms.size() << endl; + + cerr << "tmpword.size(): " << tmpword.size() << endl; + for (unsigned int ind = 0; ind < tmpword.size(); ind++) + { + if(ind != 0) + { + wcerr << L" "; + } + wcerr << *tmpword[ind]; + } + wcerr << endl; + + cerr << "tmpblank.size(): " << tmpblank.size() << endl; + for (unsigned int ind = 0; ind < tmpblank.size(); ind++) + { + wcerr << L"'"; + wcerr << *tmpblank[ind]; + wcerr << L"' "; + } + wcerr << endl; + + cerr << "last: " << last << endl; + cerr << "prev_last: " << prev_last << endl << endl; + } + + if(ms.size() == 0) + { + if(lastrule != NULL) + { + int num_words_to_consume = applyRule(); + + if(trace_att) + { + cerr << "num_words_to_consume: " << num_words_to_consume << endl; + } + + //Consume all the words from the input which matched the rule. + //This piece of code is executed unless the rule contains a "reject-current-rule" instruction + if(num_words_to_consume < 0) + { + banned_rules.clear(); + input_buffer.setPos(last); + } + else if(num_words_to_consume > 0) + { + banned_rules.clear(); + if(prev_last >= input_buffer.getSize()) + { + input_buffer.setPos(0); + } + else + { + input_buffer.setPos(prev_last+1); + } + int num_consumed_words = 0; + while(num_consumed_words < num_words_to_consume) + { + TransferToken& local_tt = input_buffer.next(); + if (local_tt.getType() == tt_word) + { + num_consumed_words++; + } + } + } + else + { + //Add rule to banned rules + banned_rules.insert(lastrule_id); + input_buffer.setPos(prev_last); + input_buffer.next(); + last = input_buffer.getPos(); + } + lastrule_id = -1; + } + else + { + if(tmpword.size() != 0) + { + if(trace_att) + { + cerr << "printing tmpword[0]" < tr; + if(useBilingual && preBilingual == false) + { + if(isExtended && (*tmpword[0])[0] == L'*') + { + tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false); + if(tr.first[0] == L'@') + { + tr.first[0] = L'*'; + } + else + { + tr.first = L"%" + tr.first; + } + } + else + { + tr = fstp.biltransWithQueue(*tmpword[0], false); + 
} + } + else if(preBilingual) + { + wstring sl; + wstring tl; + int seenSlash = 0; + for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) + { + if(*it == L'\\') + { + if(seenSlash == 0) + { + sl.push_back(*it); + it++; + sl.push_back(*it); + } + else + { + tl.push_back(*it); + it++; + tl.push_back(*it); + } + continue; + } + else if(*it == L'/') + { + seenSlash++; + continue; + } + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) + { + tl.push_back(*it); + } + else if(seenSlash > 1) + { + break; + } + } + //tmpword[0]->assign(sl); + tr = pair(tl, false); + //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ; + } + else + { + tr = pair(*tmpword[0], 0); + } + + if(tr.first.size() != 0) + { + if(defaultAttrs == lu) + { + fputwc_unlocked(L'^', output); + fputws_unlocked(tr.first.c_str(), output); + fputwc_unlocked(L'$', output); + } + else + { + if(tr.first[0] == '*') + { + fputws_unlocked(L"^unknown{^", output); + } + else + { + fputws_unlocked(L"^default{^", output); + } + fputws_unlocked(tr.first.c_str(), output); + fputws_unlocked(L"$}$", output); + } + } + banned_rules.clear(); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + prev_last = last; + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) + { + if(trace_att) + { + cerr << "printing tmpblank[0]" <c_str(), output); + tmpblank.clear(); + prev_last = last; + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + } + } + int val = ms.classifyFinals(me->getFinals(), banned_rules); + if(val != -1) + { + lastrule = rule_map[val-1]; + lastrule_id = val; + last = input_buffer.getPos(); + + if(trace) + { + wcerr << endl << L"apertium-transfer: Rule " << val << L" "; + for (unsigned int ind = 0; ind < tmpword.size(); ind++) + { + if (ind != 0) + { + wcerr << L" "; + } + wcerr << *tmpword[ind]; + } + wcerr << endl; + } + } + + TransferToken ¤t = readToken(in); + 
+ switch(current.getType()) + { + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(¤t.getContent()); + break; + + case tt_blank: + ms.step(L' '); + tmpblank.push_back(¤t.getContent()); + break; + + case tt_eof: + if(tmpword.size() != 0) + { + tmpblank.push_back(¤t.getContent()); + ms.clear(); + } + else + { + fputws_unlocked(current.getContent().c_str(), output); + return; + } + break; + + default: + cerr << "Error: Unknown input token." << endl; + return; + } + } +} + +int +Transfer::applyRule() +{ + int words_to_consume; + unsigned int limit = tmpword.size(); + //wcerr << L"applyRule: " << tmpword.size() << endl; + + for(unsigned int i = 0; i != limit; i++) + { + if(i == 0) + { + word = new TransferWord *[limit]; + lword = limit; + if(limit != 1) + { + blank = new string *[limit - 1]; + lblank = limit - 1; + } + else + { + blank = NULL; + lblank = 0; + } + } + else + { + blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + } + + pair tr; + if(useBilingual && preBilingual == false) + { + tr = fstp.biltransWithQueue(*tmpword[i], false); + } + else if(preBilingual) + { + //wcerr << "applyRule: " << *tmpword[i] << endl; + wstring sl; + wstring tl; + int seenSlash = 0; + for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) + { + if(*it == L'\\') + { + if(seenSlash == 0) + { + sl.push_back(*it); + it++; + sl.push_back(*it); + } + else + { + tl.push_back(*it); + it++; + tl.push_back(*it); + } + continue; + } + + if(*it == L'/') + { + seenSlash++; + continue; + } + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) + { + tl.push_back(*it); + } + else if(seenSlash > 1) + { + break; + } + } + //tmpword[i]->assign(sl); + tr = pair(tl, false); + } + else + { + tr = pair(*tmpword[i], false); + } + + word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), + UtfConverter::toUtf8(tr.first), tr.second); + } + + words_to_consume = processRule(lastrule); + lastrule = NULL; + + if(word) + { 
+ for(unsigned int i = 0; i != limit; i++) + { + delete word[i]; + } + delete[] word; + } + if(blank) + { + for(unsigned int i = 0; i != limit - 1; i++) + { + delete blank[i]; + } + delete[] blank; + } + word = NULL; + blank = NULL; + tmpword.clear(); + tmpblank.clear(); + ms.init(me->getInitial()); + return words_to_consume; +} + +/* HERE */ +void +Transfer::applyWord(wstring const &word_str) +{ + ms.step(L'^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) + { + switch(word_str[i]) + { + case L'\\': + i++; + ms.step(towlower(word_str[i]), any_char); + break; + + case L'/': + i = limit; + break; + + case L'<': + for(unsigned int j = i+1; j != limit; j++) + { + if(word_str[j] == L'>') + { + int symbol = alphabet(word_str.substr(i, j-i+1)); + if(symbol) + { + ms.step(symbol, any_tag); + } + else + { + ms.step(any_tag); + } + i = j; + break; + } + } + break; + + default: + ms.step(towlower(word_str[i]), any_char); + break; + } + } + ms.step(L'$'); +} + +void +Transfer::setPreBilingual(bool value) +{ + preBilingual = value; +} + +bool +Transfer::getPreBilingual(void) const +{ + return preBilingual; +} + +void +Transfer::setUseBilingual(bool value) +{ + useBilingual = value; +} + +bool +Transfer::getUseBilingual(void) const +{ + return useBilingual; +} + +void +Transfer::setCaseSensitiveness(bool value) +{ + fstp.setCaseSensitiveMode(value); +} Index: branches/apertium-tagger/apertium2/apertium/transfer.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.dtd (revision 69632) @@ -0,0 +1,489 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/transfer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.h (revision 69632) @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _TRANSFER_ +#define _TRANSFER_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +class Transfer +{ +private: + + Alphabet alphabet; + MatchExe *me; + MatchState ms; + map attr_items; + map variables; + map macros; + map, Ltstr> lists; + map, Ltstr> listslow; + vector macro_map; + vector rule_map; + xmlDoc *doc; + xmlNode *root_element; + TransferWord **word; + string **blank; + int lword, lblank; + Buffer input_buffer; + vector tmpword; + vector tmpblank; + + FSTProcessor fstp; + FSTProcessor extended; + bool isExtended; + FILE *output; + int any_char; + int any_tag; + + xmlNode *lastrule; + unsigned int nwords; + + map evalStringCache; + + enum OutputType{lu,chunk}; + + OutputType defaultAttrs; + bool preBilingual; + bool useBilingual; + bool null_flush; + bool internal_null_flush; + bool trace; + bool trace_att; + string emptyblank; + + void destroy(); + void readData(FILE *input); + void readBil(string const &filename); + void readTransfer(string const &input); + void collectMacros(xmlNode *localroot); + void collectRules(xmlNode *localroot); + string caseOf(string const &str); + string copycase(string const &source_word, string const &target_word); + + void processLet(xmlNode *localroot); + void processAppend(xmlNode *localroot); + int processRejectCurrentRule(xmlNode *localroot); + void processOut(xmlNode *localroot); + void processCallMacro(xmlNode *localroot); + void processModifyCase(xmlNode *localroot); + bool processLogical(xmlNode *localroot); + bool processTest(xmlNode *localroot); + bool processAnd(xmlNode *localroot); + bool processOr(xmlNode *localroot); + bool processEqual(xmlNode *localroot); + bool processBeginsWith(xmlNode *localroot); + bool processBeginsWithList(xmlNode *localroot); + bool processEndsWith(xmlNode *localroot); + bool processEndsWithList(xmlNode *local); + bool 
processContainsSubstring(xmlNode *localroot); + bool processNot(xmlNode *localroot); + bool processIn(xmlNode *localroot); + int processRule(xmlNode *localroot); + string evalString(xmlNode *localroot); + int processInstruction(xmlNode *localroot); + int processChoose(xmlNode *localroot); + string processChunk(xmlNode *localroot); + string processTags(xmlNode *localroot); + + bool beginsWith(string const &str1, string const &str2) const; + bool endsWith(string const &str1, string const &str2) const; + string tolower(string const &str) const; + string tags(string const &str) const; + wstring readWord(FILE *in); + wstring readBlank(FILE *in); + wstring readUntil(FILE *in, int const symbol) const; + void applyWord(wstring const &word_str); + int applyRule(); + TransferToken & readToken(FILE *in); + bool checkIndex(xmlNode *element, int index, int limit); + void transfer_wrapper_null_flush(FILE *in, FILE *out); +public: + Transfer(); + ~Transfer(); + + void read(string const &transferfile, string const &datafile, + string const &fstfile = ""); + void transfer(FILE *in, FILE *out); + void setUseBilingual(bool value); + bool getUseBilingual(void) const; + void setPreBilingual(bool value); + bool getPreBilingual(void) const; + void setExtendedDictionary(string const &fstfile); + void setCaseSensitiveness(bool value); + bool getNullFlush(void); + void setNullFlush(bool null_flush); + void setTrace(bool trace); + void setTraceATT(bool trace); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/a.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/a.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/a.cc (revision 69632) @@ -0,0 +1,50 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software 
Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "a.h" + +#include "analysis.h" +#include "exception.h" + +namespace Apertium { +bool operator==(const a &a_, const a &b_) { + return a_.TheTags == b_.TheTags && a_.TheMorphemes == b_.TheMorphemes; +} + +bool operator<(const a &a_, const a &b_) { + if (a_.TheTags == b_.TheTags) + return a_.TheMorphemes < b_.TheMorphemes; + + return a_.TheTags < b_.TheTags; +} + +a::a() : TheTags(), TheMorphemes() {} + +a::a(const Analysis &Analysis_) : TheTags(), TheMorphemes() { + if (Analysis_.TheMorphemes.empty()) + throw Exception::Analysis::TheMorphemes_empty("can't convert const " + "Analysis & comprising empty " + "Morpheme std::vector to a"); + + if (Analysis_.TheMorphemes.front().TheTags.empty()) + throw Exception::Morpheme::TheTags_empty("can't convert const Analysis & " + "comprising Morpheme comprising " + "empty Tag std::vector to a"); + + TheTags = Analysis_.TheMorphemes.front().TheTags; + TheMorphemes = std::vector(Analysis_.TheMorphemes.begin() + 1, + Analysis_.TheMorphemes.end()); +} +} Index: branches/apertium-tagger/apertium2/apertium/a.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/a.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/a.h (revision 69632) @@ -0,0 +1,37 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software 
Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef A_H +#define A_H + +#include "analysis.h" +#include "morpheme.h" +#include "tag.h" + +#include + +namespace Apertium { +class a { +public: + friend bool operator==(const a &a_, const a &b_); + friend bool operator<(const a &a_, const a &b_); + a(); + a(const Analysis &Analysis_); + std::vector TheTags; + std::vector TheMorphemes; +}; +} + +#endif // A_H Index: branches/apertium-tagger/apertium2/apertium/align.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/align.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/align.cc (revision 69632) @@ -0,0 +1,56 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "align.h" + +#include "linebreak.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +void align::align_( + const std::vector > &string_) { + const std::streamsize width_ = col(string_) + 2; + + for (std::vector >::const_iterator i_ = + string_.begin(); + i_ != string_.end(); ++i_) { + std::cerr << " " << std::setw(width_) << std::left << i_->first + << std::setw(0) + << linebreak::linebreak_(i_->second, width_ + 2, width_ + 4) + << '\n'; + } +} + +std::string::size_type +align::col(const std::vector > &string_) { + std::string::size_type col_ = 0; + + for (std::vector >::const_iterator i_ = + string_.begin(); + i_ != string_.end(); ++i_) { + if (i_->first.size() > col_) + col_ = i_->first.size(); + } + + return col_; +} +} Index: branches/apertium-tagger/apertium2/apertium/align.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/align.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/align.h (revision 69632) @@ -0,0 +1,35 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef ALIGN_H +#define ALIGN_H + +#include +#include +#include + +namespace Apertium { +class align { +public: + static void + align_(const std::vector > &string_); + +private: + static std::string::size_type + col(const std::vector > &string_); +}; +} + +#endif // ALIGN_H Index: branches/apertium-tagger/apertium2/apertium/analysis.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/analysis.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/analysis.cc (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "analysis.h" + +#include "exception.h" +#include "morpheme.h" + +#include +#include + +namespace Apertium { +std::wostream &operator<<(std::wostream &Stream_, const Analysis &Analysis_) { + Stream_ << static_cast(Analysis_); + return Stream_; +} + +bool operator==(const Analysis &a, const Analysis &b) { + return a.TheMorphemes == b.TheMorphemes; +} + +bool operator<(const Analysis &a, const Analysis &b) { + return a.TheMorphemes < b.TheMorphemes; +} + +Analysis::operator std::wstring() const { + if (TheMorphemes.empty()) + throw Exception::Analysis::TheMorphemes_empty( + "can't convert Analysis comprising empty Morpheme std::vector to " + "std::wstring"); + + std::vector::const_iterator Morpheme_ = TheMorphemes.begin(); + std::wstring wstring_ = *Morpheme_; + ++Morpheme_; + + // Call .end() each iteration to save memory. + for (; Morpheme_ != TheMorphemes.end(); ++Morpheme_) { + wstring_ += L"+" + static_cast(*Morpheme_); + } + + return wstring_; +} +} Index: branches/apertium-tagger/apertium2/apertium/analysis.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/analysis.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/analysis.h (revision 69632) @@ -0,0 +1,37 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef ANALYSIS_H +#define ANALYSIS_H + +#include "morpheme.h" + +#include +#include +#include + +namespace Apertium { +class Analysis { +public: + friend std::wostream &operator<<(std::wostream &Stream_, + const Analysis &Analysis_); + friend bool operator==(const Analysis &a, const Analysis &b); + friend bool operator<(const Analysis &a, const Analysis &b); + operator std::wstring() const; + std::vector TheMorphemes; +}; +} + +#endif // ANALYSIS_H Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.cc (revision 69632) @@ -0,0 +1,20 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_5_3_1_tagger.h" + +namespace Apertium { +basic_5_3_1_Tagger::basic_5_3_1_Tagger() {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_5_3_1_TAGGER_H +#define BASIC_5_3_1_TAGGER_H + +#include "analysis.h" + +#include +#include + +namespace Apertium { +class basic_5_3_1_Tagger { +protected: + basic_5_3_1_Tagger(); + std::map Model; +}; +} + +#endif // BASIC_5_3_1_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.cc (revision 69632) @@ -0,0 +1,20 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_5_3_2_tagger.h" + +namespace Apertium { +basic_5_3_2_Tagger::basic_5_3_2_Tagger() {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.h (revision 69632) @@ -0,0 +1,33 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_5_3_2_TAGGER_H +#define BASIC_5_3_2_TAGGER_H + +#include "a.h" +#include "lemma.h" + +#include +#include + +namespace Apertium { +class basic_5_3_2_Tagger { +protected: + basic_5_3_2_Tagger(); + std::map > Model; +}; +} + +#endif // BASIC_5_3_2_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_3_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_3_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_3_tagger.h (revision 69632) @@ -0,0 +1,35 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_5_3_3_TAGGER_H +#define BASIC_5_3_3_TAGGER_H + +#include "i.h" +#include "lemma.h" + +#include +#include +#include + +namespace Apertium { +class basic_5_3_3_Tagger { +protected: + std::pair >, + std::pair >, + std::map > > > Model; +}; +} + +#endif // BASIC_5_3_3_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_exception_type.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_exception_type.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_exception_type.cc (revision 69632) @@ -0,0 +1,20 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_exception_type.h" + +namespace Apertium { +basic_ExceptionType::~basic_ExceptionType() throw() {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_exception_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_exception_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_exception_type.h (revision 69632) @@ -0,0 +1,29 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_EXCEPTION_TYPE_H +#define BASIC_EXCEPTION_TYPE_H + +#include + +namespace Apertium { +class basic_ExceptionType : public std::exception { +public: + virtual ~basic_ExceptionType() throw() = 0; + virtual const char *what() const throw() = 0; +}; +} + +#endif // BASIC_EXCEPTION_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.cc (revision 69632) @@ -0,0 +1,125 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_stream_tagger.h" + +#include "apertium_config.h" + +#include "basic_tagger.h" +#include "lexical_unit.h" +#include "stream.h" +#include "streamed_type.h" + +#include + +#if ENABLE_DEBUG + +#include +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +basic_StreamTagger::~basic_StreamTagger() {} + +void basic_StreamTagger::tag(Stream &Input, std::wostream &Output) const { + while (true) { + StreamedType StreamedType_ = Input.get(); + Output << StreamedType_.TheString; + + if (!StreamedType_.TheLexicalUnit) { + if (!Input.flush_()) + break; + + Output << std::flush; + continue; + } + +#if ENABLE_DEBUG + + std::wcerr << L"\n\n"; + +#endif // ENABLE_DEBUG + + tag(*StreamedType_.TheLexicalUnit, Output); + + if (Input.flush_()) + Output << std::flush; + } +} + +basic_StreamTagger::basic_StreamTagger(const basic_Tagger::Flags &Flags_) + : basic_Tagger(Flags_) {} + +void basic_StreamTagger::tag(const LexicalUnit &LexicalUnit_, + std::wostream &Output) const { +#if ENABLE_DEBUG + + for (std::vector::const_iterator Analysis_ = + LexicalUnit_.TheAnalyses.begin(); + Analysis_ != LexicalUnit_.TheAnalyses.end(); ++Analysis_) { + std::wcerr << L"score(\"" << *Analysis_ << L"\") ==\n " + << score_DEBUG(*Analysis_) << L" ==\n " << std::fixed + << std::setprecision(std::numeric_limits::digits10) + << score(*Analysis_) << L"\n"; + } + +#endif // ENABLE_DEBUG + + Output << L"^"; + + if (LexicalUnit_.TheAnalyses.empty()) { + if (TheFlags.getShowSuperficial()) + Output << LexicalUnit_.TheSurfaceForm << L"/"; + + Output << L"*" << LexicalUnit_.TheSurfaceForm << L"$"; + return; + } + + if (TheFlags.getMark()) { + if (LexicalUnit_.TheAnalyses.size() != 1) + Output << L"="; + } + + if (TheFlags.getShowSuperficial()) + Output << LexicalUnit_.TheSurfaceForm << L"/"; + + std::vector::const_iterator TheAnalysis = + LexicalUnit_.TheAnalyses.begin(); + + for (std::vector::const_iterator Analysis_ = + LexicalUnit_.TheAnalyses.begin() + 1; + // Call .end() each 
iteration to save memory. + Analysis_ != LexicalUnit_.TheAnalyses.end(); ++Analysis_) { + if (score(*Analysis_) > score(*TheAnalysis)) + TheAnalysis = Analysis_; + } + + Output << *TheAnalysis; + + if (TheFlags.getFirst()) { + for (std::vector::const_iterator Analysis_ = + LexicalUnit_.TheAnalyses.begin(); + // Call .end() each iteration to save memory. + Analysis_ != LexicalUnit_.TheAnalyses.end(); ++Analysis_) { + if (Analysis_ != TheAnalysis) + Output << L"/" << *Analysis_; + } + } + + Output << L"$"; +} +} Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.h (revision 69632) @@ -0,0 +1,56 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_STREAM_TAGGER_H +#define BASIC_STREAM_TAGGER_H + +#include "apertium_config.h" + +#include "basic_tagger.h" +#include "lexical_unit.h" +#include "stream.h" + +#include +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +class basic_StreamTagger : protected basic_Tagger { +public: + virtual ~basic_StreamTagger(); + virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; + void tag(Stream &Input, std::wostream &Output) const; + +protected: + basic_StreamTagger(const Flags &Flags_); + virtual long double score(const Analysis &Analysis_) const = 0; + +#if ENABLE_DEBUG + + virtual std::wstring score_DEBUG(const Analysis &Analysis_) const = 0; + +#endif // ENABLE_DEBUG + +private: + void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) const; +}; +} + +#endif // BASIC_STREAM_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.cc (revision 69632) @@ -0,0 +1,59 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_stream_tagger_trainer.h" + +#include "analysis.h" +#include "basic_tagger.h" +#include "exception.h" +#include "stream.h" +#include "streamed_type.h" + +namespace Apertium { +basic_StreamTaggerTrainer::~basic_StreamTaggerTrainer() {} + +void basic_StreamTaggerTrainer::train(Stream &TaggedCorpus) { + while (true) { + StreamedType StreamedType_ = TaggedCorpus.get(); + + if (!StreamedType_.TheLexicalUnit) + break; + + if (StreamedType_.TheLexicalUnit->TheAnalyses.empty()) + throw Exception::LexicalUnit::TheAnalyses_empty( + "can't train LexicalUnit comprising empty Analysis std::vector"); + + if (OccurrenceCoefficient % + StreamedType_.TheLexicalUnit->TheAnalyses.size() != + 0) { + OccurrenceCoefficient *= StreamedType_.TheLexicalUnit->TheAnalyses.size(); + multiplyModel(StreamedType_.TheLexicalUnit->TheAnalyses.size()); + } + + for (std::vector::const_iterator Analysis_ = + StreamedType_.TheLexicalUnit->TheAnalyses.begin(); + Analysis_ != StreamedType_.TheLexicalUnit->TheAnalyses.end(); + ++Analysis_) { + train_Analysis(*Analysis_, + OccurrenceCoefficient / + StreamedType_.TheLexicalUnit->TheAnalyses.size()); + } + } +} + +basic_StreamTaggerTrainer::basic_StreamTaggerTrainer( + const basic_Tagger::Flags &Flags_) + : basic_Tagger(Flags_), OccurrenceCoefficient(1) {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.h (revision 69632) @@ -0,0 +1,41 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef BASIC_STREAM_TAGGER_TRAINER_H +#define BASIC_STREAM_TAGGER_TRAINER_H + +#include "basic_tagger.h" +#include "stream.h" + +#include + +namespace Apertium { +class basic_StreamTaggerTrainer : protected basic_Tagger { +public: + virtual ~basic_StreamTaggerTrainer(); + void train(Stream &TaggedCorpus); + virtual void serialise(std::ostream &Serialised_basic_Tagger) const = 0; + +protected: + basic_StreamTaggerTrainer(const Flags &Flags_); + virtual void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) = 0; + virtual void + multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier) = 0; + std::size_t OccurrenceCoefficient; +}; +} + +#endif // BASIC_STREAM_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/basic_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_tagger.cc (revision 69632) @@ -0,0 +1,48 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "basic_tagger.h" + +namespace Apertium { +basic_Tagger::Flags::Flags() + : Debug(false), First(false), Mark(false), ShowSuperficial(false), + NullFlush(false) {} + +bool basic_Tagger::Flags::getDebug() const { return Debug; } + +void basic_Tagger::Flags::setDebug(const bool &Debug_) { Debug = Debug_; } + +bool basic_Tagger::Flags::getFirst() const { return First; } + +void basic_Tagger::Flags::setFirst(const bool &First_) { First = First_; } + +bool basic_Tagger::Flags::getMark() const { return Mark; } + +void basic_Tagger::Flags::setMark(const bool &Mark_) { Mark = Mark_; } + +bool basic_Tagger::Flags::getShowSuperficial() const { return ShowSuperficial; } + +void basic_Tagger::Flags::setShowSuperficial(const bool &ShowSuperficial_) { + ShowSuperficial = ShowSuperficial_; +} + +bool basic_Tagger::Flags::getNullFlush() const { return NullFlush; } + +void basic_Tagger::Flags::setNullFlush(const bool &NullFlush_) { + NullFlush = NullFlush_; +} + +basic_Tagger::basic_Tagger(const Flags &Flags_) : TheFlags(Flags_) {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_tagger.h (revision 69632) @@ -0,0 +1,60 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef BASIC_TAGGER_H +#define BASIC_TAGGER_H + +namespace Apertium { +class basic_Tagger { +public: + class Flags { + public: + Flags(); + bool getDebug() const; + void setDebug(const bool &Debug_); + bool getFirst() const; + void setFirst(const bool &First_); + bool getMark() const; + void setMark(const bool &Mark_); + bool getShowSuperficial() const; + void setShowSuperficial(const bool &ShowSuperficial_); + bool getNullFlush() const; + void setNullFlush(const bool &NullFlush_); + static bool (Flags::*GetDebug)() const; + static void (Flags::*SetDebug)(const bool &); + static bool (Flags::*GetFirst)() const; + static void (Flags::*SetFirst)(const bool &); + static bool (Flags::*GetMark)() const; + static void (Flags::*SetMark)(const bool &); + static bool (Flags::*GetShowSuperficial)() const; + static void (Flags::*SetShowSuperficial)(const bool &); + static bool (Flags::*GetNullFlush)() const; + static void (Flags::*SetNullFlush)(const bool &); + + private: + bool Debug : 1; + bool First : 1; + bool Mark : 1; + bool ShowSuperficial : 1; + bool NullFlush : 1; + }; + +protected: + basic_Tagger(const Flags &Flags_); + Flags TheFlags; +}; +} + +#endif // BASIC_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/constructor_eq_delete.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/constructor_eq_delete.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/constructor_eq_delete.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at 
your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef CONSTRUCTOR_EQ_DELETE_H +#define CONSTRUCTOR_EQ_DELETE_H + +namespace Apertium { +class constructor_eq_delete { +protected: + constructor_eq_delete() {} + ~constructor_eq_delete() {} + +private: + constructor_eq_delete(const constructor_eq_delete &constructor_eq_delete_); + constructor_eq_delete & + operator=(constructor_eq_delete constructor_eq_delete_); +}; +} + +#endif // CONSTRUCTOR_EQ_DELETE_H Index: branches/apertium-tagger/apertium2/apertium/deserialiser.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/deserialiser.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/deserialiser.h (revision 69632) @@ -0,0 +1,255 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+
+#ifndef DESERIALISER_H
+#define DESERIALISER_H
+
+#include "a.h"
+#include "analysis.h"
+#include "basic_exception_type.h"
+#include "exception.h"
+#include "i.h"
+#include "lemma.h"
+#include "morpheme.h"
+#include "tag.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace Apertium {
+// The primary template is only declared. Every supported type has its own
+// specialisation with a single static member, deserialise(std::istream &),
+// that reads one value of that type from the stream and returns it.
+template class Deserialiser;
+
+template <> class Deserialiser {
+public:
+  inline static a deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static Analysis deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static i deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static Lemma deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static Morpheme deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static Tag deserialise(std::istream &Stream_);
+};
+
+template
+class Deserialiser > {
+public:
+  inline static std::basic_string
+  deserialise(std::istream &Stream_);
+};
+
+template
+class Deserialiser > {
+public:
+  inline static std::map
+  deserialise(std::istream &Stream_);
+};
+
+template
+class Deserialiser > {
+public:
+  inline static std::pair
+  deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static std::size_t deserialise(std::istream &Stream_);
+};
+
+template class Deserialiser > {
+public:
+  inline static std::vector deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser {
+public:
+  inline static wchar_t deserialise(std::istream &Stream_);
+};
+
+// Reads TheTags first, then TheMorphemes.
+a Deserialiser::deserialise(std::istream &Stream_) {
+  a StreamedType_;
+  StreamedType_.TheTags = Deserialiser >::deserialise(Stream_);
+  StreamedType_.TheMorphemes =
+      Deserialiser >::deserialise(Stream_);
+  return StreamedType_;
+}
+
+// Reads TheMorphemes.
+Analysis Deserialiser::deserialise(std::istream &Stream_) {
+  Analysis SerialisedType_;
+  SerialisedType_.TheMorphemes =
+      Deserialiser >::deserialise(Stream_);
+  return SerialisedType_;
+}
+
+// Reads TheTags.
+i Deserialiser::deserialise(std::istream &Stream_) {
+  i StreamedType_;
+  StreamedType_.TheTags = Deserialiser >::deserialise(Stream_);
+  return StreamedType_;
+}
+
+// Reads TheLemma.
+Lemma Deserialiser::deserialise(std::istream &Stream_) {
+  Lemma StreamedType_;
+  StreamedType_.TheLemma = Deserialiser::deserialise(Stream_);
+  return StreamedType_;
+}
+
+// Reads TheLemma first, then TheTags.
+Morpheme Deserialiser::deserialise(std::istream &Stream_) {
+  Morpheme SerialisedType_;
+  SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_);
+  SerialisedType_.TheTags =
+      Deserialiser >::deserialise(Stream_);
+  return SerialisedType_;
+}
+
+// Reads TheTag.
+Tag Deserialiser::deserialise(std::istream &Stream_) {
+  Tag SerialisedType_;
+  SerialisedType_.TheTag = Deserialiser::deserialise(Stream_);
+  return SerialisedType_;
+}
+
+// Reads an element count, then that many characters, appending each one.
+template
+std::basic_string
+Deserialiser >::deserialise(
+    std::istream &Stream_) {
+  std::size_t SerialisedValueCount =
+      Deserialiser::deserialise(Stream_);
+  std::basic_string SerialisedType_;
+
+  for (; SerialisedValueCount != 0; --SerialisedValueCount) {
+    SerialisedType_.push_back(Deserialiser::deserialise(Stream_));
+  }
+
+  return SerialisedType_;
+}
+
+// Reads an element count, then that many pairs, inserting each one.
+template
+std::map
+Deserialiser >::deserialise(
+    std::istream &Stream_) {
+  std::size_t SerialisedValueCount =
+      Deserialiser::deserialise(Stream_);
+  std::map SerialisedType_;
+
+  for (; SerialisedValueCount != 0; --SerialisedValueCount) {
+    SerialisedType_.insert(
+        Deserialiser >::deserialise(Stream_));
+  }
+
+  return SerialisedType_;
+}
+
+// Reads first, then second.
+template
+std::pair
+Deserialiser >::deserialise(
+    std::istream &Stream_) {
+  std::pair SerialisedType_;
+  SerialisedType_.first = Deserialiser::deserialise(Stream_);
+  SerialisedType_.second = Deserialiser::deserialise(Stream_);
+  return SerialisedType_;
+}
+
+// Reads one byte giving the number of value bytes that follow, then adds each
+// value byte into the result. The shift shrinks as the remaining count
+// shrinks, so the first byte read is the most significant one.
+// A failed read throws not_Stream_good, which is caught below and rethrown as
+// size_t_ with a longer message.
+// NOTE(review): the size byte is not checked against sizeof(std::size_t); a
+// larger value would ask for shifts wider than the result -- presumably the
+// serialiser never writes one, confirm.
+std::size_t Deserialiser::deserialise(std::istream &Stream_) {
+  try {
+    std::size_t SerialisedType_ = 0;
+    unsigned char SerialisedTypeSize = Stream_.get();
+
+    if (!Stream_)
+      throw Exception::Deserialiser::not_Stream_good("can't deserialise size");
+
+    for (; SerialisedTypeSize != 0;) {
+      SerialisedType_ +=
+          static_cast(Stream_.get())
+          << std::numeric_limits::digits * --SerialisedTypeSize;
+
+      if (!Stream_)
+        throw Exception::Deserialiser::not_Stream_good(
+            "can't deserialise byte");
+    }
+
+    return SerialisedType_;
+  } catch (const basic_ExceptionType &basic_ExceptionType_) {
+    std::stringstream what_;
+    what_ << "can't deserialise std::size_t: " << basic_ExceptionType_.what();
+    throw Exception::Deserialiser::size_t_(what_);
+  }
+}
+
+// Reads an element count, then that many elements, appending each one.
+template
+std::vector
+Deserialiser >::deserialise(std::istream &Stream_) {
+  std::size_t SerialisedValueCount =
+      Deserialiser::deserialise(Stream_);
+  std::vector SerialisedType_;
+
+  for (; SerialisedValueCount != 0; --SerialisedValueCount) {
+    SerialisedType_.push_back(Deserialiser::deserialise(Stream_));
+  }
+
+  return SerialisedType_;
+}
+
+// Same byte layout as the std::size_t reader above: a size byte followed by
+// that many value bytes, most significant first. Failures are rethrown as
+// wchar_t_.
+// NOTE(review): "unsigned wchar_t" is not a combination the C++ standard
+// allows; confirm which compilers this is expected to build with.
+wchar_t Deserialiser::deserialise(std::istream &Stream_) {
+  try {
+    unsigned wchar_t SerialisedType_ = 0;
+    unsigned char SerialisedTypeSize = Stream_.get();
+
+    if (!Stream_)
+      throw Exception::Deserialiser::not_Stream_good("can't deserialise size");
+
+    for (; SerialisedTypeSize != 0;) {
+      SerialisedType_ +=
+          static_cast(Stream_.get())
+          << std::numeric_limits::digits * --SerialisedTypeSize;
+
+      if (!Stream_)
+        throw Exception::Deserialiser::not_Stream_good(
+            "can't deserialise byte");
+    }
+
+    return static_cast(SerialisedType_);
+  } catch (const basic_ExceptionType &basic_ExceptionType_) {
+    std::stringstream what_;
+    what_ << "can't deserialise wchar_t: " << basic_ExceptionType_.what();
+    throw Exception::Deserialiser::wchar_t_(what_);
+  }
+}
+}
+
+#endif // DESERIALISER_H
Index: branches/apertium-tagger/apertium2/apertium/err_exception.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/err_exception.h (nonexistent)
+++
branches/apertium-tagger/apertium2/apertium/err_exception.h (revision 69632)
@@ -0,0 +1,23 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef ERR_EXCEPTION_H
+#define ERR_EXCEPTION_H
+
+namespace Apertium {
+// Empty class with no members and no base class.
+// NOTE(review): presumably thrown as a tag type to signal an error that has
+// already been reported -- confirm against the throw sites.
+class err_Exception {};
+}
+
+#endif // ERR_EXCEPTION_H
Index: branches/apertium-tagger/apertium2/apertium/exception_type.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/exception_type.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/exception_type.cc (revision 69632)
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#include "exception_type.h"
+
+#include
+#include
+
+namespace Apertium {
+// Each constructor copies the given message into what_.
+ExceptionType::ExceptionType(const char *const what_) : what_(what_) {}
+
+ExceptionType::ExceptionType(const std::string &what_) : what_(what_) {}
+
+// Takes the text accumulated in the stream via str().
+ExceptionType::ExceptionType(const std::stringstream &what_)
+    : what_(what_.str()) {}
+
+ExceptionType::~ExceptionType() throw() {}
+
+// Returns a pointer into the stored message string.
+const char *ExceptionType::what() const throw() { return what_.c_str(); }
+}
Index: branches/apertium-tagger/apertium2/apertium/exception_type.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/exception_type.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/exception_type.h (revision 69632)
@@ -0,0 +1,38 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef EXCEPTION_TYPE_H
+#define EXCEPTION_TYPE_H
+
+#include "basic_exception_type.h"
+
+#include
+#include
+
+namespace Apertium {
+// Abstract exception base that stores a message and returns it from what().
+// The message can be supplied as a C string, a std::string or a
+// std::stringstream.
+class ExceptionType : public basic_ExceptionType {
+public:
+  ExceptionType(const char *const what_);
+  ExceptionType(const std::string &what_);
+  ExceptionType(const std::stringstream &what_);
+  // Pure virtual, which makes the class abstract.
+  virtual ~ExceptionType() throw() = 0;
+  const char *what() const throw();
+
+protected:
+  // Message text; const, so it is fixed at construction.
+  const std::string what_;
+};
+}
+
+#endif // EXCEPTION_TYPE_H
Index: branches/apertium-tagger/apertium2/apertium/file_tagger.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/file_tagger.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/file_tagger.cc (revision 69632)
@@ -0,0 +1,42 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#include "file_tagger.h"
+
+#include
+
+#include
+
+namespace Apertium {
+// All three flags start out false.
+FILE_Tagger::FILE_Tagger() : debug(false), show_sf(false), null_flush(false) {}
+
+FILE_Tagger::~FILE_Tagger() {}
+
+// Plain setters for the three flags.
+void FILE_Tagger::set_debug(const bool &Debug) { debug = Debug; }
+
+void FILE_Tagger::set_show_sf(const bool &ShowSuperficial) {
+  show_sf = ShowSuperficial;
+}
+
+void FILE_Tagger::setNullFlush(const bool &NullFlush) {
+  null_flush = NullFlush;
+}
+
+// Reads the named tagger specification file with a TSXReader and passes the
+// resulting tagger data to another deserialise overload, which is not defined
+// in this file.
+void FILE_Tagger::deserialise(char *const TaggerSpecificationFilename) {
+  TSXReader TaggerSpecificationReader_;
+  TaggerSpecificationReader_.read(TaggerSpecificationFilename);
+  deserialise(TaggerSpecificationReader_.getTaggerData());
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/i.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/i.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/i.cc (revision 69632)
@@ -0,0 +1,50 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#include "i.h"
+
+#include "analysis.h"
+#include "exception.h"
+#include "morpheme.h"
+
+namespace Apertium {
+// Equality and ordering are those of the TheTags members.
+bool operator==(const i &a_, const i &b_) { return a_.TheTags == b_.TheTags; }
+
+bool operator<(const i &a_, const i &b_) { return a_.TheTags < b_.TheTags; }
+
+i::i() {}
+
+// Copies the tags of the first morpheme of Analysis_.
+// Throws TheMorphemes_empty when Analysis_ has no morphemes, and
+// TheTags_empty when its first morpheme has no tags.
+i::i(const Analysis &Analysis_) : TheTags() {
+  if (Analysis_.TheMorphemes.empty())
+    throw Exception::Analysis::TheMorphemes_empty("can't convert const "
+                                                  "Analysis & comprising empty "
+                                                  "Morpheme std::vector to i");
+
+  if (Analysis_.TheMorphemes.front().TheTags.empty())
+    throw Exception::Morpheme::TheTags_empty("can't convert const Analysis & "
+                                             "comprising Morpheme comprising "
+                                             "empty Tag std::vector to i");
+
+  TheTags = Analysis_.TheMorphemes.front().TheTags;
+}
+
+// Copies the tags of Morpheme_; throws TheTags_empty when it has none.
+i::i(const Morpheme &Morpheme_) : TheTags() {
+  if (Morpheme_.TheTags.empty())
+    throw Exception::Morpheme::TheTags_empty(
+        "can't convert const Morpheme & comprising empty Tag std::vector to i");
+
+  TheTags = Morpheme_.TheTags;
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/i.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/i.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/i.h (revision 69632)
@@ -0,0 +1,38 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef I_H
+#define I_H
+
+#include "analysis.h"
+#include "morpheme.h"
+#include "tag.h"
+
+#include
+
+namespace Apertium {
+// A sequence of tags (TheTags) that can be built from an Analysis or from a
+// Morpheme, and compared with == and <.
+class i {
+  friend bool operator==(const i &a_, const i &b_);
+  friend bool operator<(const i &a_, const i &b_);
+
+public:
+  i();
+  // Converting constructors; neither is marked explicit.
+  i(const Analysis &Analysis_);
+  i(const Morpheme &Morpheme_);
+  std::vector TheTags;
+};
+}
+
+#endif // I_H
Index: branches/apertium-tagger/apertium2/apertium/lemma.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/lemma.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/lemma.cc (revision 69632)
@@ -0,0 +1,55 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#include "lemma.h"
+
+#include "analysis.h"
+#include "exception.h"
+#include "morpheme.h"
+
+namespace Apertium {
+// Equality and ordering are those of the TheLemma members.
+bool operator==(const Lemma &a_, const Lemma &b_) {
+  return a_.TheLemma == b_.TheLemma;
+}
+
+bool operator<(const Lemma &a_, const Lemma &b_) {
+  return a_.TheLemma < b_.TheLemma;
+}
+
+Lemma::Lemma() : TheLemma() {}
+
+// Copies the lemma of the first morpheme of Analysis_.
+// Throws TheMorphemes_empty when Analysis_ has no morphemes, and
+// TheLemma_empty when its first morpheme has an empty lemma.
+Lemma::Lemma(const Analysis &Analysis_) : TheLemma() {
+  if (Analysis_.TheMorphemes.empty())
+    throw Exception::Analysis::TheMorphemes_empty(
+        "can't convert const Analysis & comprising empty Morpheme std::vector "
+        "to Lemma");
+
+  if (Analysis_.TheMorphemes.front().TheLemma.empty())
+    throw Exception::Morpheme::TheLemma_empty(
+        "can't convert const Analysis & comprising Morpheme comprising empty "
+        "Lemma std::wstring to Lemma");
+
+  TheLemma = Analysis_.TheMorphemes.front().TheLemma;
+}
+
+// Copies the lemma of Morpheme_; throws TheLemma_empty when it is empty.
+Lemma::Lemma(const Morpheme &Morpheme_) : TheLemma() {
+  if (Morpheme_.TheLemma.empty())
+    throw Exception::Morpheme::TheLemma_empty("can't convert const Morpheme & "
+                                              "comprising empty Lemma "
+                                              "std::wstring to Lemma");
+
+  TheLemma = Morpheme_.TheLemma;
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/lemma.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/lemma.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/lemma.h (revision 69632)
@@ -0,0 +1,36 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef LEMMA_H
+#define LEMMA_H
+
+#include "analysis.h"
+#include "morpheme.h"
+
+#include
+
+namespace Apertium {
+// A lemma string (TheLemma) that can be built from an Analysis or from a
+// Morpheme, and compared with == and <.
+class Lemma {
+public:
+  friend bool operator==(const Lemma &a_, const Lemma &b_);
+  friend bool operator<(const Lemma &a_, const Lemma &b_);
+  Lemma();
+  // Converting constructors; neither is marked explicit.
+  Lemma(const Analysis &Analysis_);
+  Lemma(const Morpheme &Morpheme_);
+  std::wstring TheLemma;
+};
+}
+
+#endif // LEMMA_H
Index: branches/apertium-tagger/apertium2/apertium/lexical_unit.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/lexical_unit.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/lexical_unit.h (revision 69632)
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+ +#ifndef TAGGING_EXPRESSION_H +#define TAGGING_EXPRESSION_H + +#include "analysis.h" + +#include +#include + +namespace Apertium { +class LexicalUnit { +public: + std::wstring TheSurfaceForm; + std::vector TheAnalyses; +}; +} + +#endif // LEXICAL_UNIT_H Index: branches/apertium-tagger/apertium2/apertium/linebreak.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/linebreak.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/linebreak.cc (revision 69632) @@ -0,0 +1,94 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "linebreak.h" + +#include + +namespace Apertium { +std::string linebreak::linebreak_(std::string string_, + std::string::size_type col, + const std::string::size_type &wrapmargin) { + std::string::size_type i_ = 0; + + while (true) { + if (i_ == string_.size()) + return string_; + + if (col < 79) { + if (string_.at(i_) == '\n') { + if (i_ + 1 == string_.size()) { + string_.erase(i_, 1); + return string_; + } + + string_.insert(i_ + 1, wrapmargin, ' '); + col = wrapmargin; + i_ += wrapmargin; + continue; + } + + ++col; + ++i_; + continue; + } + + if (string_.at(i_) == ' ') { + std::string::size_type j_ = i_ + 1; + + for (; i_ != 0; --i_) { + if (string_.at(i_ - 1) != ' ') + break; + } + + for (;; ++j_) { + if (j_ == string_.size()) { + string_.erase(i_, j_ - i_); + return string_; + } + + if (string_.at(j_) != ' ') + break; + } + + linebreak_(string_, col, wrapmargin, i_, j_); + continue; + } + + std::string::size_type j_ = i_; + + for (; j_ != 0; --j_) { + if (string_.at(j_ - 1) == ' ') + break; + } + + for (i_ = j_; i_ != 0; --i_) { + if (string_.at(i_ - 1) != ' ') + break; + } + + linebreak_(string_, col, wrapmargin, i_, j_); + } +} + +void linebreak::linebreak_(std::string &string_, std::string::size_type &col, + const std::string::size_type &wrapmargin, + std::string::size_type &i_, + const std::string::size_type &j_) { + string_.replace(i_, j_ - i_, '\n' + std::string(wrapmargin, ' ')); + col = wrapmargin; + i_ += 1 /* '\n' */ + wrapmargin /* std::string(wrapmargin, ' ') */; +} +} Index: branches/apertium-tagger/apertium2/apertium/linebreak.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/linebreak.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/linebreak.h (revision 69632) @@ -0,0 +1,36 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the 
GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef LINEBREAK_H
+#define LINEBREAK_H
+
+#include
+
+namespace Apertium {
+// Holds two static overloads named linebreak_: a public entry point and a
+// private helper.
+class linebreak {
+public:
+  // Takes string_ by value and returns the wrapped result. col is the
+  // starting column and wrapmargin the indentation written after each line
+  // break.
+  static std::string linebreak_(std::string string_,
+                                std::string::size_type col,
+                                const std::string::size_type &wrapmargin);
+
+private:
+  // Modifies string_, col and i_ in place; j_ marks the end of the range that
+  // starts at i_.
+  static void linebreak_(std::string &string_, std::string::size_type &col,
+                         const std::string::size_type &wrapmargin,
+                         std::string::size_type &i_,
+                         const std::string::size_type &j_);
+};
+}
+
+#endif // LINEBREAK_H
Index: branches/apertium-tagger/apertium2/apertium/morpheme.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/morpheme.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/morpheme.cc (revision 69632)
@@ -0,0 +1,57 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#include "morpheme.h"
+
+#include "exception.h"
+#include "tag.h"
+
+#include
+#include
+
+namespace Apertium {
+// Two morphemes are equal when both the lemma and the tags are equal.
+bool operator==(const Morpheme &a, const Morpheme &b) {
+  return a.TheLemma == b.TheLemma && a.TheTags == b.TheTags;
+}
+
+// Orders by lemma first; the tags decide only when the lemmas are equal.
+bool operator<(const Morpheme &a, const Morpheme &b) {
+  if (a.TheLemma != b.TheLemma)
+    return a.TheLemma < b.TheLemma;
+
+  return a.TheTags < b.TheTags;
+}
+
+// Returns the lemma followed by every tag converted with static_cast, in
+// order. Throws TheTags_empty when there are no tags and TheLemma_empty when
+// the lemma is empty; the tags are checked first.
+Morpheme::operator std::wstring() const {
+  if (TheTags.empty())
+    throw Exception::Morpheme::TheTags_empty("can't convert Morpheme "
+                                             "comprising empty Tag std::vector "
+                                             "to std::wstring");
+
+  if (TheLemma.empty())
+    throw Exception::Morpheme::TheLemma_empty("can't convert Morpheme "
+                                              "comprising empty TheLemma "
+                                              "std::wstring to std::wstring");
+
+  std::wstring wstring_ = TheLemma;
+
+  for (std::vector::const_iterator Tag_ = TheTags.begin();
+       // Call .end() each iteration to save memory.
+       Tag_ != TheTags.end(); ++Tag_) {
+    wstring_ += static_cast(*Tag_);
+  }
+
+  return wstring_;
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/morpheme.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/morpheme.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/morpheme.h (revision 69632)
@@ -0,0 +1,35 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef MORPHEME_H
+#define MORPHEME_H
+
+#include "tag.h"
+
+#include
+#include
+
+namespace Apertium {
+// A lemma string plus a sequence of tags, comparable with == and <.
+class Morpheme {
+public:
+  friend bool operator==(const Morpheme &a, const Morpheme &b);
+  friend bool operator<(const Morpheme &a, const Morpheme &b);
+  // Implicit conversion to std::wstring (not marked explicit).
+  operator std::wstring() const;
+  std::wstring TheLemma;
+  std::vector TheTags;
+};
+}
+
+#endif // MORPHEME_H
Index: branches/apertium-tagger/apertium2/apertium/optional.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/optional.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/optional.h (revision 69632)
@@ -0,0 +1,123 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+
+#ifndef OPTIONAL_H
+#define OPTIONAL_H
+
+#include "exception.h"
+
+#include
+#include
+#include
+#include
+
+namespace Apertium {
+template class Optional;
+
+template
+void swap(Optional &A, Optional &B);
+
+// Holds either nothing (a null pointer) or a heap-allocated copy of a value.
+// The value is created with new in the constructors and released with delete
+// in the destructor.
+template class Optional {
+public:
+  friend void swap(Optional &A, Optional &B);
+  // Empty Optional.
+  Optional();
+  // Stores a copy of OptionalType_.
+  Optional(const OptionalType &OptionalType_);
+  // Copies the value held by Optional_, if it holds one.
+  Optional(const Optional &Optional_);
+  // Takes its argument by value and swaps with it.
+  Optional &operator=(Optional Optional_);
+  ~Optional();
+  // The four accessors throw TheOptionalTypePointer_null when empty.
+  const OptionalType &operator*() const;
+  OptionalType &operator*();
+  const OptionalType *operator->() const;
+  OptionalType *operator->();
+  // True when a value is held; the conversion is implicit.
+  operator bool() const;
+
+private:
+  OptionalType *TheOptionalTypePointer;
+};
+
+// Exchanges the two stored pointers; no value is copied.
+template
+void swap(Optional &A, Optional &B) {
+  using std::swap;
+  swap(A.TheOptionalTypePointer, B.TheOptionalTypePointer);
+}
+
+template
+Optional::Optional()
+    : TheOptionalTypePointer(NULL) {}
+
+template
+Optional::Optional(const OptionalType &OptionalType_)
+    : TheOptionalTypePointer(new OptionalType(OptionalType_)) {}
+
+// Copies the pointed-to value rather than the pointer, so the two objects
+// never share storage.
+template
+Optional::Optional(const Optional &Optional_) {
+  if (Optional_.TheOptionalTypePointer == NULL) {
+    TheOptionalTypePointer = NULL;
+    return;
+  }
+
+  TheOptionalTypePointer =
+      new OptionalType(*(Optional_.TheOptionalTypePointer));
+}
+
+// The by-value parameter is already a copy. After the swap it holds the old
+// pointer and releases it when it is destroyed.
+template
+Optional &Optional::operator=(Optional Optional_) {
+  swap(*this, Optional_);
+  return *this;
+}
+
+template Optional::~Optional() {
+  if (TheOptionalTypePointer == NULL)
+    return;
+
+  delete TheOptionalTypePointer;
+}
+
+template
+const OptionalType &Optional::operator*() const {
+  if (TheOptionalTypePointer == NULL)
+    throw Exception::Optional::TheOptionalTypePointer_null(
+        "can't dereference Optional comprising null OptionalType pointer");
+
+  return *TheOptionalTypePointer;
+}
+
+// Forwards to the const overload and casts the result back to non-const.
+template
+OptionalType &Optional::operator*() {
+  return const_cast(
+      static_cast(*this).operator*());
+}
+
+template
+const OptionalType *Optional::operator->() const {
+  if (TheOptionalTypePointer == NULL)
+    throw Exception::Optional::TheOptionalTypePointer_null(
+        "can't dereference Optional comprising null OptionalType pointer");
+
+  return TheOptionalTypePointer;
+}
+
+// Forwards to the const overload and casts the result back to non-const.
+template
+OptionalType *Optional::operator->() {
+  return const_cast(
+      static_cast(*this).operator->());
+}
+
+template Optional::operator bool() const {
+  return TheOptionalTypePointer != NULL;
+}
+}
+
+#endif
Index: branches/apertium-tagger/apertium2/apertium/stream.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/stream.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/stream.cc (revision 69632)
@@ -0,0 +1,774 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see .
+ +#include "stream.h" + +#include "analysis.h" +#include "basic_tagger.h" +#include "streamed_type.h" +#include "wchar_t_exception.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +Stream::Stream(const basic_Tagger::Flags &Flags_) + : TheCharacterStream(std::wcin), TheFilename(), TheLineNumber(1), TheLine(), + TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} + +Stream::Stream(const basic_Tagger::Flags &Flags_, + std::wifstream &CharacterStream_, const char *const Filename_) + : TheCharacterStream(CharacterStream_), TheFilename(Filename_), + TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), + ThePreviousCase() {} + +Stream::Stream(const basic_Tagger::Flags &Flags_, + std::wifstream &CharacterStream_, const std::string &Filename_) + : TheCharacterStream(CharacterStream_), TheFilename(Filename_), + TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), + ThePreviousCase() {} + +Stream::Stream(const basic_Tagger::Flags &Flags_, + std::wifstream &CharacterStream_, + const std::stringstream &Filename_) + : TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()), + TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), + ThePreviousCase() {} + +StreamedType Stream::get() { + StreamedType TheStreamedType; + std::wstring Lemma; + private_flush_ = false; + + if (!is_eof_throw_if_not_TheCharacterStream_good()) { + while (true) { + const wchar_t Character_ = TheCharacterStream.get(); + + if (is_eof_throw_if_not_TheCharacterStream_good(TheStreamedType, Lemma, + Character_)) + break; + + TheLine.push_back(Character_); + + switch (Character_) { + case L'\\': // <\> 92, Hex 5c, Octal 134 + case_0x5c(TheStreamedType, Lemma, Character_); + continue; + case L'[': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ 
<< L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '[' expected to follow ']' or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + push_back_Character(TheStreamedType, Lemma, Character_); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L']': + if (!ThePreviousCase) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"', ']' expected to follow '['"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + ThePreviousCase = PreviousCaseType(Character_); + continue; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', ']' expected to follow '['"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + std::abort(); + case L'^': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '^' expected to follow '[', ']', or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + TheStreamedType.TheLexicalUnit = LexicalUnit(); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'/': + if (!ThePreviousCase) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"', '/' expected to follow '[', to follow '>' immediately, " + L"or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + 
push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'^': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + + { + const wchar_t Character_ = TheCharacterStream.get(); + + if (is_eof_throw_if_not_TheCharacterStream_good( + TheStreamedType, Lemma, Character_)) { + std::wstringstream Message; + Message << L"unexpected end-of-file following '" + << ThePreviousCase->ThePreviousCase + << "', end-of-file expected to follow ']' or '$'"; + throw wchar_t_Exception::Stream::UnexpectedEndOfFile( + Message_what(Message)); + } + + TheLine.push_back(Character_); + + switch (Character_) { + case L'\\': + TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + case_0x5c(TheStreamedType, Lemma, Character_); + continue; + case L'*': + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'\n': { + std::wstringstream Message; + Message << L"unexpected newline following '" + << ThePreviousCase->ThePreviousCase + << "', newline expected to follow '[', ']', or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCharacter( + Message_what(Message)); + }; + case L'[': + case L']': + case L'^': + case L'#': + case L'<': + case L'>': + case L'+': + case L'$': { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase << L"', expected '*'"; + throw wchar_t_Exception::Stream::UnexpectedPreviousCase( + Message_what(Message)); + } + default: + 
TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + } + } + + continue; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'#': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'*': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '*' expected to follow '[', ']', or '$' or to follow " + L"'/' immediately"; 
+ throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'<': + if (!ThePreviousCase) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'/': + case L'#': + case L'+': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheTags.push_back(Tag()); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'>': + if 
(!ThePreviousCase) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"', '>' expected to " + L"follow '[' or to follow " + L"'<' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'<': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '>' expected to " + L"follow '[' or to follow " + L"'<' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + continue; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '>' expected to " + L"follow '[' or to follow " + L"'<' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + std::abort(); + case L'#': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'/': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '#' expected to follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '/' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '#' expected to 
follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '/' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '#' expected to follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '/' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + continue; + } + + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'+': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '+' expected to follow '[', ']', or '$', to follow " + L"'>' " + L"immediately, or to follow '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'#': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '+' expected to follow '[', ']', or '$', to follow " + L"'>' " + L"immediately, or to follow '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: { + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '+' expected to follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '#' not immediately"; + throw 
wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + ThePreviousCase = PreviousCaseType(Character_); + continue; + } + + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'$': + if (!ThePreviousCase) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'*': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + if (TheFlags.getDebug()) { + if (Lemma != TheStreamedType.TheLexicalUnit->TheSurfaceForm) + std::wcerr << L"unexpected lemma \"" << Lemma + << L"\", expected \"" + << TheStreamedType.TheLexicalUnit->TheSurfaceForm + << L"\"\n"; + } + + ThePreviousCase = PreviousCaseType(Character_); + return TheStreamedType; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'#': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" 
<< Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + return TheStreamedType; + case L'\n': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected newline following '" + << ThePreviousCase->ThePreviousCase + << L"', newline expected to follow '[', ']', or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + push_back_Character(TheStreamedType, Lemma, Character_); + ++TheLineNumber; + TheLine.clear(); + continue; + default: + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + } + + std::abort(); + } + } + + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected end-of-file following '" + << ThePreviousCase->ThePreviousCase + << L"', end-of-file expected to follow ']' " + L"or '$'"; + throw wchar_t_Exception::Stream::UnexpectedEndOfFile( + Message_what(Message)); + } + } + + return TheStreamedType; +} + +bool Stream::flush_() const { return private_flush_; } + +Stream::PreviousCaseType::PreviousCaseType(const wchar_t &PreviousCase_) + : ThePreviousCase(PreviousCase_), isPreviousCharacter(true) {} + +bool 
Stream::is_eof_throw_if_not_TheCharacterStream_good() const { + if (TheCharacterStream.eof()) + return true; + + if (!TheCharacterStream) { + std::wstringstream Message; + Message << L"can't get const wchar_t: TheCharacterStream not good"; + throw wchar_t_Exception::Stream::TheCharacterStream_not_good( + Message_what(Message)); + } + + return false; +} + +std::wstring Stream::Message_what(const std::wstringstream &Message) const { + std::wstringstream what_; + + if (TheFilename) + what_ << std::wstring(TheFilename->begin(), TheFilename->end()) << L": "; + + what_ << TheLineNumber << L":" << TheLine.size() << L": " << Message.str() + << L'\n' << TheLine << L'\n' << std::wstring(TheLine.size() - 1, L' ') + << L'^'; + return what_.str(); +} + +bool +Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_) { + if (isTheCharacterStream_eof(StreamedType_, Lemma, Character_)) + return true; + + if (!TheCharacterStream) { + std::wstringstream Message; + Message << L"can't get const wchar_t: TheCharacterStream not good"; + throw wchar_t_Exception::Stream::TheCharacterStream_not_good( + Message_what(Message)); + } + + return false; +} + +bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_) { + if (TheCharacterStream.eof()) + return true; + + if (TheFlags.getNullFlush()) { + if (Character_ == L'\0') { + push_back_Character(StreamedType_, Lemma, Character_); + private_flush_ = true; + return true; + } + } + + return false; +} + +void Stream::push_back_Character(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_) { + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + StreamedType_.TheString += Character_; + break; + case L']': + StreamedType_.TheString += Character_; + break; + case L'^': + StreamedType_.TheLexicalUnit->TheSurfaceForm += Character_; + break; + case L'/': + 
StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheLemma.push_back(Character_); + break; + case L'*': + Lemma += Character_; + break; + case L'<': + StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheTags.back() + .TheTag += Character_; + break; + case L'>': { + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase << L"'"; + throw wchar_t_Exception::Stream::UnexpectedCharacter( + Message_what(Message)); + } + case L'#': + StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheLemma.push_back(Character_); + break; + case L'+': + StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheLemma.push_back(Character_); + break; + case L'$': + StreamedType_.TheString += Character_; + break; + default: + std::wstringstream Message; + Message << L"unexpected previous reserved or special character '" + << ThePreviousCase->ThePreviousCase << L"'"; + throw wchar_t_Exception::Stream::UnexpectedPreviousCase( + Message_what(Message)); + } + + ThePreviousCase->isPreviousCharacter = false; + return; + } + + StreamedType_.TheString += Character_; +} + +void Stream::case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, + const wchar_t &Character_) { + push_back_Character(StreamedType_, Lemma, Character_); + + { + const wchar_t Character_ = TheCharacterStream.get(); + + if (is_eof_throw_if_not_TheCharacterStream_good(StreamedType_, Lemma, + Character_)) { + std::wstringstream Message; + Message << L"unexpected end-of-file following '\\', end-of-file " + L"expected to follow ']' or '$'"; + throw wchar_t_Exception::Stream::UnexpectedEndOfFile( + Message_what(Message)); + } + + TheLine.push_back(Character_); + push_back_Character(StreamedType_, Lemma, Character_); + } +} +} Index: branches/apertium-tagger/apertium2/apertium/stream.h =================================================================== 
--- branches/apertium-tagger/apertium2/apertium/stream.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream.h (revision 69632) @@ -0,0 +1,69 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_H +#define STREAM_H + +#include "basic_tagger.h" +#include "optional.h" +#include "streamed_type.h" + +#include +#include +#include +#include + +namespace Apertium { +class Stream { +public: + Stream(const basic_Tagger::Flags &Flags_); + Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + const char *const Filename_); + Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + const std::string &Filename_); + Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + const std::stringstream &Filename_); + StreamedType get(); + bool flush_() const; + +private: + class PreviousCaseType { + public: + PreviousCaseType(const wchar_t &PreviousCase_); + wchar_t ThePreviousCase; + bool isPreviousCharacter : 1; + }; + bool is_eof_throw_if_not_TheCharacterStream_good() const; + std::wstring Message_what(const std::wstringstream &Message) const; + bool is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_); + bool isTheCharacterStream_eof(StreamedType &StreamedType_, + 
std::wstring &Lemma, const wchar_t &Character_); + void push_back_Character(StreamedType &StreamedType_, std::wstring &Lemma, + const wchar_t &Character_); + void case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, + const wchar_t &Character_); + std::wistream &TheCharacterStream; + Optional TheFilename; + std::size_t TheLineNumber; + std::wstring TheLine; + const basic_Tagger::Flags &TheFlags; + bool private_flush_ : 1; + Optional ThePreviousCase; +}; +} + +#endif // STREAM_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.cc (revision 69632) @@ -0,0 +1,68 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "stream_5_3_1_tagger.h" + +#include "apertium_config.h" + +#include "analysis.h" +#include "deserialiser.h" +#include "lexical_unit.h" +#include "stream.h" +#include "streamed_type.h" + +#include +#include +#include +#include + +#if ENABLE_DEBUG + +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +Stream_5_3_1_Tagger::Stream_5_3_1_Tagger(const Flags &Flags_) + : basic_5_3_1_Tagger(), basic_StreamTagger(Flags_) {} + +void Stream_5_3_1_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { + Model = Deserialiser >::deserialise( + Serialised_basic_Tagger); +} + +long double Stream_5_3_1_Tagger::score(const Analysis &Analysis_) const { + return tokenCount_T(Analysis_); +} + +long double Stream_5_3_1_Tagger::tokenCount_T(const Analysis &Analysis_) const { + if (Model.find(Analysis_) == Model.end()) + return 1; + + return 1 + Model.find(Analysis_)->second; +} + +#if ENABLE_DEBUG + +std::wstring Stream_5_3_1_Tagger::score_DEBUG(const Analysis &Analysis_) const { + std::wstringstream score_DEBUG_; + score_DEBUG_ << tokenCount_T(Analysis_); + return score_DEBUG_.str(); +} + +#endif // ENABLE_DEBUG + +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.h (revision 69632) @@ -0,0 +1,53 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_5_3_1_TAGGER_H +#define STREAM_5_3_1_TAGGER_H + +#include "apertium_config.h" + +#include "analysis.h" +#include "basic_5_3_1_tagger.h" +#include "basic_stream_tagger.h" + +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +class Stream_5_3_1_Tagger : private basic_5_3_1_Tagger, + public basic_StreamTagger { +public: + Stream_5_3_1_Tagger(const Flags &Flags_); + void deserialise(std::istream &Serialised_basic_Tagger); + +private: + long double score(const Analysis &Analysis_) const; + long double tokenCount_T(const Analysis &Analysis_) const; + +#if ENABLE_DEBUG + + std::wstring score_DEBUG(const Analysis &Analysis_) const; + +#endif // ENABLE_DEBUG + +}; +} + +#endif // STREAM_5_3_1_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.h (revision 69632) @@ -0,0 +1,41 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_5_3_1_TAGGER_TRAINER_H +#define STREAM_5_3_1_TAGGER_TRAINER_H + +#include "basic_5_3_1_tagger.h" +#include "basic_stream_tagger_trainer.h" + +#include "analysis.h" +#include "stream.h" + +#include + +namespace Apertium { +class Stream_5_3_1_TaggerTrainer : private basic_5_3_1_Tagger, + public basic_StreamTaggerTrainer { +public: + Stream_5_3_1_TaggerTrainer(const Flags &Flags_); + void serialise(std::ostream &Serialised_basic_Tagger) const; + +private: + void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_); + void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); +}; +} + +#endif // STREAM_5_3_1_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.cc (revision 69632) @@ -0,0 +1,104 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "stream_5_3_2_tagger.h" + +#include "apertium_config.h" + +#include "a.h" +#include "analysis.h" +#include "deserialiser.h" +#include "lemma.h" + +#include +#include +#include + +#if ENABLE_DEBUG + +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +Stream_5_3_2_Tagger::Stream_5_3_2_Tagger(const Flags &Flags_) + : basic_5_3_2_Tagger(), basic_StreamTagger(Flags_) {} + +void Stream_5_3_2_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { + Model = + Deserialiser > >::deserialise( + Serialised_basic_Tagger); +} + +long double Stream_5_3_2_Tagger::score(const Analysis &Analysis_) const { + return (tokenCount_r_a(Analysis_) * tokenCount_a(Analysis_)) / + (tokenCount_a(Analysis_) + typeCount_a(Analysis_)); +} + +long double +Stream_5_3_2_Tagger::tokenCount_r_a(const Analysis &Analysis_) const { + if (Model.find(a(Analysis_)) == Model.end()) + return 1; + + if (Model.find(a(Analysis_))->second.find(Lemma(Analysis_)) == + Model.find(a(Analysis_))->second.end()) + return 1; + + return 1 + Model.find(a(Analysis_))->second.find(Lemma(Analysis_))->second; +} + +long double Stream_5_3_2_Tagger::tokenCount_a(const Analysis &Analysis_) const { + if (Model.find(a(Analysis_)) == Model.end()) + return 1; + + long double tokenCount_a_ = 1; + + for (std::map::const_iterator Lemma_ = + Model.find(a(Analysis_))->second.begin(); + Lemma_ != Model.find(a(Analysis_))->second.end(); ++Lemma_) { + tokenCount_a_ += Lemma_->second; + } + + return tokenCount_a_; +} + +long double Stream_5_3_2_Tagger::typeCount_a(const Analysis &Analysis_) const { + if (Model.find(a(Analysis_)) == Model.end()) + return 1; + + return (Model.find(a(Analysis_))->second.find(Lemma(Analysis_)) == + Model.find(a(Analysis_))->second.end() + ? 
1 + : 0) + + Model.find(a(Analysis_))->second.size(); +} + +#if ENABLE_DEBUG + +std::wstring Stream_5_3_2_Tagger::score_DEBUG(const Analysis &Analysis_) const { + std::wstringstream score_DEBUG_; + + score_DEBUG_ << L"(" << tokenCount_r_a(Analysis_) << L" * " + << tokenCount_a(Analysis_) << L") /\n (" + << tokenCount_a(Analysis_) << L" + " << typeCount_a(Analysis_) + << L")"; + + return score_DEBUG_.str(); +} + +#endif // ENABLE_DEBUG + +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.h (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef STREAM_5_3_2_TAGGER_H +#define STREAM_5_3_2_TAGGER_H + +#include "apertium_config.h" + +#include "analysis.h" +#include "basic_5_3_2_tagger.h" +#include "basic_stream_tagger.h" + +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +class Stream_5_3_2_Tagger : private basic_5_3_2_Tagger, + public basic_StreamTagger { +public: + Stream_5_3_2_Tagger(const Flags &Flags_); + void deserialise(std::istream &Serialised_basic_Tagger); + +private: + long double score(const Analysis &Analysis_) const; + long double tokenCount_r_a(const Analysis &Analysis_) const; + long double tokenCount_a(const Analysis &Analysis_) const; + long double typeCount_a(const Analysis &Analysis_) const; + +#if ENABLE_DEBUG + + std::wstring score_DEBUG(const Analysis &Analysis_) const; + +#endif // ENABLE_DEBUG + +}; +} + +#endif // STREAM_5_3_2_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.h (revision 69632) @@ -0,0 +1,38 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef STREAM_5_3_2_TAGGER_TRAINER_H +#define STREAM_5_3_2_TAGGER_TRAINER_H + +#include "basic_5_3_2_tagger.h" +#include "basic_stream_tagger_trainer.h" + +#include + +namespace Apertium { +class Stream_5_3_2_TaggerTrainer : private basic_5_3_2_Tagger, + public basic_StreamTaggerTrainer { +public: + Stream_5_3_2_TaggerTrainer(const Flags &Flags_); + void serialise(std::ostream &Serialised_basic_Tagger) const; + +private: + void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_); + void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); +}; +} + +#endif // STREAM_5_3_2_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.cc (revision 69632) @@ -0,0 +1,223 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "stream_5_3_3_tagger.h" + +#include "apertium_config.h" + +#include "analysis.h" +#include "deserialiser.h" +#include "i.h" +#include "lemma.h" +#include "morpheme.h" + +#include + +#if ENABLE_DEBUG + +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +Stream_5_3_3_Tagger::Stream_5_3_3_Tagger(const Flags &Flags_) + : basic_StreamTagger(Flags_) {} + +void Stream_5_3_3_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { + Model = Deserialiser< + std::pair >, + std::pair >, + std::map > > > >:: + deserialise(Serialised_basic_Tagger); +} + +long double Stream_5_3_3_Tagger::score(const Analysis &Analysis_) const { + long double score = tokenCount_r_i(Analysis_) * tokenCount_i(Analysis_), + score_Divisor = tokenCount_i(Analysis_) + typeCount_i(Analysis_); + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + score *= tokenCount_d_i_Morpheme(Lemma(*Morpheme_), i(*(Morpheme_ - 1))) * + tokenCount_i_d_Morpheme(i(*Morpheme_), Lemma(*Morpheme_)); + score_Divisor *= + (tokenCount_i_Morpheme(i(*(Morpheme_ - 1))) + + typeCount_i_Morpheme(i(*(Morpheme_ - 1)), Lemma(*Morpheme_))) * + (tokenCount_d_Morpheme(Lemma(*Morpheme_)) + + typeCount_d_Morpheme(Lemma(*Morpheme_), i(*Morpheme_))); + } + + return score / score_Divisor; +} + +long double +Stream_5_3_3_Tagger::tokenCount_r_i(const Analysis &Analysis_) const { + if (Model.first.find(i(Analysis_)) == Model.first.end()) + return 1; + + if (Model.first.find(i(Analysis_))->second.find(Lemma(Analysis_)) == + Model.first.find(i(Analysis_))->second.end()) + return 1; + + return 1 + + Model.first.find(i(Analysis_))->second.find(Lemma(Analysis_))->second; +} + +long double Stream_5_3_3_Tagger::tokenCount_i(const Analysis &Analysis_) const { + if (Model.first.find(i(Analysis_)) == Model.first.end()) + return 1; + + long double tokenCount_i_ = 1; + + for (std::map::const_iterator Lemma_ = + 
Model.first.find(i(Analysis_))->second.begin(); + Lemma_ != Model.first.find(i(Analysis_))->second.end(); ++Lemma_) { + tokenCount_i_ += Lemma_->second; + } + + return tokenCount_i_; +} + +long double Stream_5_3_3_Tagger::typeCount_i(const Analysis &Analysis_) const { + if (Model.first.find(i(Analysis_)) == Model.first.end()) + return 1; + + return (Model.first.find(i(Analysis_))->second.find(Lemma(Analysis_)) == + Model.first.find(i(Analysis_))->second.end() + ? 1 + : 0) + + Model.first.find(i(Analysis_))->second.size(); +} + +long double Stream_5_3_3_Tagger::tokenCount_d_i_Morpheme(const Lemma &Lemma_, + const i &i_) const { + if (Model.second.first.find(i_) == Model.second.first.end()) + return 1; + + if (Model.second.first.find(i_)->second.find(Lemma_) == + Model.second.first.find(i_)->second.end()) + return 1; + + return 1 + Model.second.first.find(i_)->second.find(Lemma_)->second; +} + +long double +Stream_5_3_3_Tagger::tokenCount_i_d_Morpheme(const i &i_, + const Lemma &Lemma_) const { + if (Model.second.second.find(Lemma_) == Model.second.second.end()) + return 1; + + if (Model.second.second.find(Lemma_)->second.find(i_) == + Model.second.second.find(Lemma_)->second.end()) + return 1; + + return 1 + Model.second.second.find(Lemma_)->second.find(i_)->second; +} + +long double Stream_5_3_3_Tagger::tokenCount_i_Morpheme(const i &i_) const { + if (Model.second.first.find(i_) == Model.second.first.end()) + return 1; + + long double typeCount_i_Morpheme_ = 1; + + for (std::map::const_iterator Lemma_ = + Model.second.first.find(i_)->second.begin(); + Lemma_ != Model.second.first.find(i_)->second.end(); ++Lemma_) { + typeCount_i_Morpheme_ += Lemma_->second; + } + + return typeCount_i_Morpheme_; +} + +long double +Stream_5_3_3_Tagger::typeCount_i_Morpheme(const i &i_, + const Lemma &Lemma_) const { + if (Model.second.first.find(i_) == Model.second.first.end()) + return 1; + + return (Model.second.first.find(i_)->second.find(Lemma_) == + 
Model.second.first.find(i_)->second.end() + ? 1 + : 0) + + Model.second.first.find(i_)->second.size(); +} + +long double +Stream_5_3_3_Tagger::tokenCount_d_Morpheme(const Lemma &Lemma_) const { + if (Model.second.second.find(Lemma_) == Model.second.second.end()) + return 1; + + long double tokenCount_d_Morpheme_ = 1; + + for (std::map::const_iterator i_ = + Model.second.second.find(Lemma_)->second.begin(); + i_ != Model.second.second.find(Lemma_)->second.end(); ++i_) { + tokenCount_d_Morpheme_ += i_->second; + } + + return tokenCount_d_Morpheme_; +} + +long double Stream_5_3_3_Tagger::typeCount_d_Morpheme(const Lemma &Lemma_, + const i &i_) const { + if (Model.second.second.find(Lemma_) == Model.second.second.end()) + return 1; + + return (Model.second.second.find(Lemma_)->second.find(i_) == + Model.second.second.find(Lemma_)->second.end() + ? 1 + : 0) + + Model.second.second.find(Lemma_)->second.size(); +} + +#if ENABLE_DEBUG + +std::wstring Stream_5_3_3_Tagger::score_DEBUG(const Analysis &Analysis_) const { + std::wstringstream score_DEBUG_; + + score_DEBUG_ << L"(" << tokenCount_r_i(Analysis_) << L" * " + << tokenCount_i(Analysis_); + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + score_DEBUG_ << L" * " << tokenCount_d_i_Morpheme(Lemma(*Morpheme_), + i(*(Morpheme_ - 1))) + << L" * " + << tokenCount_i_d_Morpheme(i(*Morpheme_), Lemma(*Morpheme_)); + } + + score_DEBUG_ << L") /\n [(" << tokenCount_i(Analysis_) << L" + " + << typeCount_i(Analysis_) << L")"; + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + score_DEBUG_ << L" * (" << tokenCount_i_Morpheme(i(*(Morpheme_ - 1))) + << L" + " + << typeCount_i_Morpheme(i(*(Morpheme_ - 1)), Lemma(*Morpheme_)) + << L") * (" << tokenCount_d_Morpheme(Lemma(*Morpheme_)) + << L" + " + << typeCount_d_Morpheme(Lemma(*Morpheme_), 
i(*Morpheme_)) + << L")"; + } + + score_DEBUG_ << L"]"; + return score_DEBUG_.str(); +} + +#endif // ENABLE_DEBUG +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.h (revision 69632) @@ -0,0 +1,62 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef STREAM_5_3_3_TAGGER_H +#define STREAM_5_3_3_TAGGER_H + +#include "apertium_config.h" + +#include "analysis.h" +#include "basic_5_3_3_tagger.h" +#include "basic_stream_tagger.h" +#include "i.h" +#include "lemma.h" + +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +class Stream_5_3_3_Tagger : private basic_5_3_3_Tagger, + public basic_StreamTagger { +public: + Stream_5_3_3_Tagger(const Flags &Flags_); + void deserialise(std::istream &Serialised_basic_Tagger); + +private: + long double score(const Analysis &Analysis_) const; + long double tokenCount_r_i(const Analysis &Analysis_) const; + long double tokenCount_i(const Analysis &Analysis_) const; + long double typeCount_i(const Analysis &Analysis_) const; + long double tokenCount_d_i_Morpheme(const Lemma &Lemma_, const i &i_) const; + long double tokenCount_i_d_Morpheme(const i &i_, const Lemma &Lemma_) const; + long double tokenCount_i_Morpheme(const i &i_) const; + long double typeCount_i_Morpheme(const i &i_, const Lemma &Lemma_) const; + long double tokenCount_d_Morpheme(const Lemma &Lemma_) const; + long double typeCount_d_Morpheme(const Lemma &Lemma_, const i &i_) const; + +#if ENABLE_DEBUG + + std::wstring score_DEBUG(const Analysis &Analysis_) const; + +#endif // ENABLE_DEBUG +}; +} + +#endif // STREAM_5_3_3_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.h (revision 69632) @@ -0,0 +1,39 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at 
your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_5_3_3_TAGGER_TRAINER_H +#define STREAM_5_3_3_TAGGER_TRAINER_H + +#include "analysis.h" +#include "basic_5_3_3_tagger.h" +#include "basic_stream_tagger_trainer.h" + +#include + +namespace Apertium { +class Stream_5_3_3_TaggerTrainer : private basic_5_3_3_Tagger, + public basic_StreamTaggerTrainer { +public: + Stream_5_3_3_TaggerTrainer(const Flags &Flags_); + void serialise(std::ostream &Serialised_basic_Tagger) const; + +private: + void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_); + void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); +}; +} + +#endif // STREAM_5_3_3_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/streamed_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/streamed_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/streamed_type.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAMED_TYPE_H +#define STREAMED_TYPE_H + +#include "lexical_unit.h" +#include "optional.h" + +#include + +namespace Apertium { +class StreamedType { +public: + std::wstring TheString; + Optional TheLexicalUnit; +}; +} + +#endif // STREAMED_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/tag.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tag.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tag.cc (revision 69632) @@ -0,0 +1,34 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "tag.h" + +#include "exception.h" + +#include + +namespace Apertium { +bool operator==(const Tag &a, const Tag &b) { return a.TheTag == b.TheTag; } + +bool operator<(const Tag &a, const Tag &b) { return a.TheTag < b.TheTag; } + +Tag::operator std::wstring() const { + if (TheTag.empty()) + throw Exception::Tag::TheTags_empty("can't convert Tag comprising empty " + "TheTag std::wstring to std::wstring"); + + return L"<" + TheTag + L">"; +} +} Index: branches/apertium-tagger/apertium2/apertium/tag.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tag.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tag.h (revision 69632) @@ -0,0 +1,31 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef TAG_H +#define TAG_H + +#include + +namespace Apertium { +class Tag { +public: + friend bool operator==(const Tag &a, const Tag &b); + friend bool operator<(const Tag &a, const Tag &b); + operator std::wstring() const; + std::wstring TheTag; +}; +} + +#endif // TAG_H Index: branches/apertium-tagger/apertium2/apertium/wchar_t_exception.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/wchar_t_exception.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wchar_t_exception.h (revision 69632) @@ -0,0 +1,53 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef WCHAR_T_EXCEPTION_H +#define WCHAR_T_EXCEPTION_H + +#include "wchar_t_exception_type.h" + +#include +#include + +namespace Apertium { +namespace wchar_t_Exception { + +#define WCHAR_T_EXCEPTION(WCHAR_T_EXCEPTION_TYPE) \ + class WCHAR_T_EXCEPTION_TYPE : public ::Apertium::wchar_t_ExceptionType { \ + public: \ + WCHAR_T_EXCEPTION_TYPE(const wchar_t *wchar_t_what_) \ + : wchar_t_ExceptionType(wchar_t_what_) {} \ + WCHAR_T_EXCEPTION_TYPE(const std::wstring &wchar_t_what_) \ + : wchar_t_ExceptionType(wchar_t_what_) {} \ + WCHAR_T_EXCEPTION_TYPE(const std::wstringstream &wchar_t_what_) \ + : wchar_t_ExceptionType(wchar_t_what_) {} \ + ~WCHAR_T_EXCEPTION_TYPE() throw() {} \ + }; + +namespace Stream { +WCHAR_T_EXCEPTION(TheCharacterStream_not_good) +WCHAR_T_EXCEPTION(UnexpectedAnalysis) +WCHAR_T_EXCEPTION(UnexpectedCase) +WCHAR_T_EXCEPTION(UnexpectedCharacter) +WCHAR_T_EXCEPTION(UnexpectedEndOfFile) +WCHAR_T_EXCEPTION(UnexpectedLemma) +WCHAR_T_EXCEPTION(UnexpectedPreviousCase) +} + +#undef WCHAR_T_EXCEPTION +} +} + +#endif // WCHAR_T_EXCEPTION_H Index: branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.cc (revision 69632) @@ -0,0 +1,90 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "wchar_t_exception_type.h" + +#include "exception.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +void swap(wchar_t_ExceptionType &a, wchar_t_ExceptionType &b) { + using std::swap; + + swap(a.what_, b.what_); +} + +wchar_t_ExceptionType::wchar_t_ExceptionType(const wchar_t *wchar_t_what_) + : what_(new char[size(wchar_t_what_)]) { + constructor(wchar_t_what_); +} + +wchar_t_ExceptionType::wchar_t_ExceptionType(const std::wstring &wchar_t_what_) + : what_(new char[size(wchar_t_what_.c_str())]) { + constructor(wchar_t_what_.c_str()); +} + +wchar_t_ExceptionType::wchar_t_ExceptionType( + const std::wstringstream &wchar_t_what_) + : what_(new char[size(wchar_t_what_.str().c_str())]) { + constructor(wchar_t_what_.str().c_str()); +} + +wchar_t_ExceptionType::wchar_t_ExceptionType( + const wchar_t_ExceptionType &wchar_t_ExceptionType_) + : what_(new char[std::strlen(wchar_t_ExceptionType_.what_) + 1]) { + std::strcpy(what_, wchar_t_ExceptionType_.what_); +} + +wchar_t_ExceptionType &wchar_t_ExceptionType:: +operator=(wchar_t_ExceptionType wchar_t_ExceptionType_) { + swap(*this, wchar_t_ExceptionType_); + return *this; +} + +wchar_t_ExceptionType::~wchar_t_ExceptionType() throw() { delete[] what_; } + +const char *wchar_t_ExceptionType::what() const throw() { return what_; } + +std::size_t wchar_t_ExceptionType::size(const wchar_t *wchar_t_what_) { + std::mbstate_t ps = {0}; + errno = 0; + std::size_t size_ = std::wcsrtombs(NULL, &wchar_t_what_, 0, &ps); + + if (errno == EILSEQ) + throw Exception::wchar_t_ExceptionType::EILSEQ_( + "can't convert const wchar_t *wchar_t_what_ to char * : unexpected " + "wide character"); + + return size_ + 1; +} + +void wchar_t_ExceptionType::constructor(const wchar_t *wchar_t_what_) { + std::mbstate_t ps = {0}; + errno = 0; + std::wcsrtombs(what_, &wchar_t_what_, 
size(wchar_t_what_), &ps); + + if (errno == EILSEQ) + throw Exception::wchar_t_ExceptionType::EILSEQ_( + "can't convert const wchar_t *const wchar_t_what_ to char *what_: " + "unexpected wide character"); +} +} Index: branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.h (revision 69632) @@ -0,0 +1,45 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef WCHAR_T_EXCEPTION_TYPE_H +#define WCHAR_T_EXCEPTION_TYPE_H + +#include "basic_exception_type.h" + +#include +#include +#include + +namespace Apertium { +class wchar_t_ExceptionType : public basic_ExceptionType { +public: + friend void swap(wchar_t_ExceptionType &a, wchar_t_ExceptionType &b); + wchar_t_ExceptionType(const wchar_t *wchar_t_what_); + wchar_t_ExceptionType(const std::wstring &wchar_t_what_); + wchar_t_ExceptionType(const std::wstringstream &wchar_t_what_); + wchar_t_ExceptionType(const wchar_t_ExceptionType &wchar_t_ExceptionType_); + wchar_t_ExceptionType & + operator=(wchar_t_ExceptionType wchar_t_ExceptionType_); + virtual ~wchar_t_ExceptionType() throw(); + const char *what() const throw(); + +private: + static std::size_t size(const wchar_t *wchar_t_what_); + void constructor(const wchar_t *wchar_t_what_); + char *what_; +}; +} + +#endif // WCHAR_T_EXCEPTION_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/apertium-desmediawiki.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desmediawiki.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desmediawiki.1 (revision 69632) @@ -0,0 +1,46 @@ +.TH apertium-desmediawiki 1 2009-08-30 "" "" +.SH NAME +apertium-desmediawiki \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-desmediawiki +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desmediawiki +is a processor for mediawiki XML dumps (i.e., those produced using +Special:Export. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of a text file and produces output suitable for +processing with lt-proc. Format information (newlines, tabs, etc.) is enclosed in brackets so that lt-proc treats it as whitespace between words. 
+.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-destxt | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-deshtml\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated links - [[page|alternative text]], [[link]]s, etc. are not +supported. +.PP +The mediawiki parser has special support for mixing apostrophes and +apostrophes as formatting. This is not supported either. +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-header.sh (revision 69632) @@ -0,0 +1,660 @@ +# -*- sh-basic-offset: 2 -*- + +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . 
+ + +message () +{ + echo "USAGE: $(basename $0) [-d datadir] [-f format] [-u] [in [out]]" + echo " -d datadir directory of linguistic data" + echo " -f format one of: txt (default), html, rtf, odt, docx, wxml, xlsx, pptx," + echo " xpresstag, html-noent, latex, latex-raw" + echo " -a display ambiguity" + echo " -u don't display marks '*' for unknown words" + echo " -n don't insert period before possible sentence-ends" + echo " -m memory.tmx use a translation memory to recycle translations" + echo " -o direction translation direction using the translation memory," + echo " by default 'direction' is used instead" + echo " -l lists the available translation directions and exits" + echo " direction typically, LANG1-LANG2, but see modes.xml in language data" + echo " in input file (stdin by default)" + echo " out output file (stdout by default)" + exit 1 +} + +list_directions () +{ + for mode in "$DATADIR"/modes/*.mode; do + echo " $(basename "${mode%%.mode}")" + done +} + +locale_utf8 () +{ + export LC_CTYPE=$(locale -a|grep -i "utf[.]*8"|head -1); + if [ LC_CTYPE = "" ]; then + echo "Error: Install an UTF-8 locale in your system"; + exit 1; + fi +} + +locale_latin1 () +{ + export LC_CTYPE=$(locale -a|grep -i -e "8859-1" -e "@euro"|head -1); + if [ LC_CTYPE = "" ]; then + echo "Error: Install a Latin-1 locale in your system"; + exit 1; + fi +} + +test_zip () +{ + if [ "$(which zip)" = "" ]; then + echo "Error: Install 'zip' command in your system"; + exit 1; + fi + + if [ "$(which unzip)" = "" ]; then + echo "Error: Install 'unzip' command in your system"; + exit 1; + fi +} + +test_gawk () +{ + GAWK=$(which gawk) + if [ "$GAWK" = "" ]; then + echo "Error: Install 'gawk' in your system" + exit 1 + fi +} + + +translate_latex() +{ + test_gawk + + if [ "$INFILE" = "" -o "$INFILE" = /dev/stdin ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + + if [ "$(file -b --mime-encoding "$INFILE")" == "utf-8" ]; then + locale_latin1 
+ else locale_utf8 + fi + + "$APERTIUM_PATH/apertium-prelatex" "$INFILE" | \ + "$APERTIUM_PATH/apertium-utils-fixlatex" | \ + "$APERTIUM_PATH/apertium-deslatex" ${FORMAT_OPTIONS} | \ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-relatex"| \ + awk '{gsub("", ""); print;}' | \ + if [ "$REDIR" == "" ]; then "$APERTIUM_PATH/apertium-postlatex-raw"; else "$APERTIUM_PATH/apertium-postlatex-raw" > "$SALIDA"; fi + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE" + fi +} + + +translate_latex_raw() +{ + test_gawk + + if [ "$INFILE" = "" -o "$INFILE" = /dev/stdin ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + + if [ "$(file -b --mime-encoding "$INFILE")" = "utf-8" ]; then + locale_latin1 + else locale_utf8 + fi + + "$APERTIUM_PATH/apertium-prelatex" "$INFILE" | \ + "$APERTIUM_PATH/apertium-utils-fixlatex" | \ + "$APERTIUM_PATH/apertium-deslatex" ${FORMAT_OPTIONS} | \ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! 
-x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-relatex"| \ + awk '{gsub("", ""); print;}' | \ + if [ "$REDIR" == "" ]; then "$APERTIUM_PATH/apertium-postlatex-raw"; else "$APERTIUM_PATH/apertium-postlatex-raw" > "$SALIDA"; fi +} + + +translate_odt () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + find "$INPUT_TMPDIR" | grep "content\\.xml\\|styles\\.xml" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-desodt" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-reodt"|\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + rm -Rf ObjectReplacements + zip -q -r - . 
>"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + +translate_docx () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + if [ "$UWORDS" = "no" ]; then + OPCIONU="-u"; + else OPCIONU=""; + fi + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + + for i in $(find "$INPUT_TMPDIR"|grep "xlsx$"); + do LOCALTEMP=$(mktemp "$TMPDIR/apertium.XXXXXXXX"); + "$APERTIUM_PATH/apertium" -f xlsx -d "$DATADIR" "$OPCIONU" "$PAIR" <"$i" >"$LOCALTEMP"; + cp "$LOCALTEMP" "$i"; + rm "$LOCALTEMP"; + done; + + find "$INPUT_TMPDIR" | grep "xml" |\ + grep -v -i \\\(settings\\\|theme\\\|styles\\\|font\\\|rels\\\|docProps\\\) |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-deswxml" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-rewxml"|\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + zip -q -r - . 
>"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + +translate_pptx () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + if [ "$UWORDS" = "no" ]; then + OPCIONU="-u"; + else OPCIONU=""; + fi + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + + for i in $(find "$INPUT_TMPDIR"|grep "xlsx$"); do + LOCALTEMP=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + "$APERTIUM_PATH/apertium" -f xlsx -d "$DATADIR" "$OPCIONU" "$PAIR" <"$i" >"$LOCALTEMP"; + cp "$LOCALTEMP" "$i" + rm "$LOCALTEMP" + done; + + find "$INPUT_TMPDIR" | grep "xml$" |\ + grep "slides\/slide" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-despptx" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-repptx" |\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + zip -q -r - . 
>"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + + +translate_xlsx () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + find "$INPUT_TMPDIR" | grep "sharedStrings.xml" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-desxlsx" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-rexlsx" |\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + zip -q -r - . >"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + +translate_htmlnoent () +{ + "$APERTIUM_PATH/apertium-deshtml" ${FORMAT_OPTIONS} "$INFILE" | \ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; then + cat + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | if [ ! 
-x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | if [ "$FORMAT" = "none" ]; then + if [ "$REDIR" == "" ]; then cat; else cat > "$SALIDA"; fi + else if [ "$REDIR" == "" ]; then "$APERTIUM_PATH/apertium-rehtml-noent"; else "$APERTIUM_PATH/apertium-rehtml-noent" > "$SALIDA"; fi + fi + + rm -Rf "$TMCOMPFILE" +} + + + + + +########################################################## +# Option and argument parsing, setting globals variables # +########################################################## +PATH="${APERTIUM_PATH}:${PATH}" +[[ -z $TMPDIR ]] && TMPDIR=/tmp +TMCOMPFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") +trap 'rm -Rf "$TMCOMPFILE"' EXIT + +# Default values, may be overridden below: +PAIR="" +INFILE="/dev/stdin" +FORMAT="txt" +DATADIR=$DEFAULT_DIRECTORY +TRANSLATION_MEMORY_DIRECTION=$PAIR +LIST_MODES_AND_EXIT=false +FORMAT_OPTIONS="" + +# Skip (but store) non-option arguments that come before options: +declare -a ARGS_PREOPT +declare -i OPTIND=1 +while [[ $OPTIND -le $# ]]; do + arg=${@:$OPTIND:1} + case $arg in + -*) break ;; + *) ARGS_PREOPT+=($arg); (( OPTIND++ )) ;; + esac +done + + +while getopts ":uahlf:d:m:o:n" opt; do + case "$opt" in + f) FORMAT=$OPTARG ;; + d) DATADIR=$OPTARG ;; + m) TRANSLATION_MEMORY_FILE=$OPTARG ;; + o) TRANSLATION_MEMORY_DIRECTION=$OPTARG ;; + u) UWORDS="no" ;; + n) FORMAT_OPTIONS="-n" ;; + a) OPTION_TAGGER="-m" ;; + l) LIST_MODES_AND_EXIT=true ;; + h) message ;; + \?) echo "ERROR: Unknown option $OPTARG"; message ;; + :) echo "ERROR: $OPTARG requires an argument"; message ;; + esac +done +shift $(($OPTIND-1)) + +if $LIST_MODES_AND_EXIT; then list_directions; exit 0; fi + +# Restore non-option arguments that came before options back into arg list: +set -- "${ARGS_PREOPT[@]}" "$@" + +case "$#" in + 3) + SALIDA=$3 + REDIR=">" + INFILE=$2 + PAIR=$1 + if [[ ! 
-e "$INFILE" ]]; then + echo "Error: file '$INFILE' not found." + message + fi + ;; + 2) + INFILE=$2 + PAIR=$1 + if [[ ! -e "$INFILE" ]]; then + echo "Error: file '$INFILE' not found." + message + fi + ;; + 1) + PAIR=$1 + ;; + *) + message + ;; +esac + + +if [[ -n $TRANSLATION_MEMORY_FILE ]]; then + "$APERTIUM_PATH/lt-tmxcomp" "$TRANSLATION_MEMORY_DIRECTION" "$TRANSLATION_MEMORY_FILE" "$TMCOMPFILE" >/dev/null + if [ "$?" != "0" ]; then + echo "Error: Cannot compile TM '$TRANSLATION_MEMORY_FILE'" + echo" hint: use -o parameter" + message + fi +fi + +if [[ ! -d "$DATADIR/modes" ]]; then + echo "Error: Directory '$DATADIR/modes' does not exist." + message +fi + +if [[ ! -e "$DATADIR/modes/$PAIR.mode" ]]; then + echo -n "Error: Mode $PAIR does not exist" + c=$(find "$DATADIR/modes"|wc -l) + if [ "$c" -le 1 ]; then + echo "." + else + echo ". Try one of:" + list_directions + fi + exit 1 +fi + +#Parametro opcional, de no estar, lee de la entrada estandar (stdin) + +case "$FORMAT" in + none) + if [ "$UWORDS" = "no" ]; then + OPTION="-n"; + else OPTION="-g"; + fi + ;; + txt|rtf|html|xpresstag|mediawiki) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + ;; + rtf) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + MILOCALE=$(locale -a|grep -i -v "utf\|^C$\|^POSIX$"|head -1); + if [ "$MILOCALE" = "" ]; then + echo "Error: Install a ISO-8859-1 compatible locale in your system"; + exit 1; + fi + export LC_CTYPE=$MILOCALE + ;; + + odt) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_odt + exit 0 + ;; + latex) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_latex + exit 0 + ;; + latex-raw) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_latex_raw + exit 0 + ;; + + + docx) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_docx + exit 0 + ;; + xlsx) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else 
OPTION="-g"; + fi; + translate_xlsx + exit 0 + ;; + pptx) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_pptx + exit 0 + ;; + html-noent) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_htmlnoent + exit 0 + ;; + + wxml) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + locale_utf8 + ;; + + txtu) + FORMAT="txt"; + OPTION="-n" + ;; + htmlu) + FORMAT="html"; + OPTION="-n"; + ;; + xpresstagu) + FORMAT="xpresstag"; + OPTION="-n"; + ;; + rtfu) + FORMAT="rtf"; + OPTION="-n"; + MILOCALE=$(locale -a|grep -i -v "utf\|^C$\|^POSIX$"|head -1); + if [ "$MILOCALE" = "" ]; then + echo "Error: Install a ISO-8859-1 compatible locale in your system"; + exit 1; + fi + export LC_CTYPE=$MILOCALE + ;; + + odtu) + OPTION="-n" + translate_odt + exit 0 + ;; + + docxu) + OPTION="-n" + translate_docx + exit 0 + ;; + + xlsxu) + OPTION="-n" + translate_xlsx + exit 0 + ;; + + pptxu) + OPTION="-n" + translate_pptx + exit 0 + ;; + + wxmlu) + OPTION="-n"; + locale_utf8 + ;; + + + + *) # Por defecto asumimos txt + FORMAT="txt" + OPTION="-g" + ;; +esac + +if [ -z "$REF" ] +then + REF=$FORMAT +fi + +set -e -o pipefail + +if [ "$FORMAT" = "none" ]; then + cat "$INFILE" +else + "$APERTIUM_PATH/apertium-des$FORMAT" ${FORMAT_OPTIONS} "$INFILE" +fi | if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then + cat + else + "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE" + fi | if [ ! 
-x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else + "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | if [ "$FORMAT" = "none" ]; then + if [ "$REDIR" = "" ]; then + cat + else + cat > "$SALIDA" + fi + else + if [ "$REDIR" = "" ]; then + "$APERTIUM_PATH/apertium-re$FORMAT" + else + "$APERTIUM_PATH/apertium-re$FORMAT" > "$SALIDA" + fi + fi + Index: branches/apertium-tagger/apertium2/apertium/postchunk.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/postchunk.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.cc (revision 69632) @@ -0,0 +1,2074 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; +using namespace std; + +void +Postchunk::destroy() +{ + if(me) + { + delete me; + me = NULL; + } + if(doc) + { + xmlFreeDoc(doc); + doc = NULL; + } +} + +Postchunk::Postchunk() : +word(0), +blank(0), +lword(0), +lblank(0), +output(0), +any_char(0), +any_tag(0), +nwords(0) +{ + me = NULL; + doc = NULL; + root_element = NULL; + lastrule = NULL; + inword = false; + null_flush = false; + internal_null_flush = false; +} + +Postchunk::~Postchunk() +{ + destroy(); +} + +void +Postchunk::readData(FILE *in) +{ + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + attr_items[cad_k].read(in); + wstring fallback = Compression::wstring_read(in); + if(recompile_attrs) { + attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + } + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + macros[cad_k] = Compression::multibyte_read(in); + } + + // lists + 
for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +Postchunk::read(string const &transferfile, string const &datafile) +{ + readPostchunk(transferfile); + + // datafile + FILE *in = fopen(datafile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + +} + +void +Postchunk::readPostchunk(string const &in) +{ + doc = xmlReadFile(in.c_str(), NULL, 0); + + if(doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." << endl; + exit(EXIT_FAILURE); + } + + root_element = xmlDocGetRootElement(doc); + + // search for macros & rules + for(xmlNode *i = root_element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) + { + collectMacros(i); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) + { + collectRules(i); + } + } + } +} + +void +Postchunk::collectRules(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + for(xmlNode *j = i->children; ; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) + { + rule_map.push_back(j); + break; + } + } + } + } +} + +void +Postchunk::collectMacros(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + macro_map.push_back(i); + } + } +} + +bool +Postchunk::checkIndex(xmlNode *element, int index, int limit) +{ + if(index > limit) + { + 
wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) <line << endl; + return false; + } + return true; +} + + +string +Postchunk::evalString(xmlNode *element) +{ + map::iterator it; + it = evalStringCache.find(element); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + } + break; + + case ti_lu_count: + return StringUtils::itoa_string(tmpword.size()); + + case ti_var: + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(checkIndex(element, ti.getPos(), lblank)) + { + if(ti.getPos() >= 0) + { + return !blank?"":*(blank[ti.getPos()]); + } + return " "; + } + break; + + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) + { + return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); + } + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); + } + break; + + default: + return ""; + } + return ""; + } + + if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *)i->children->content); + } + } + + evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) + { + evalStringCache[element] = TransferInstr(ti_lit_tag, + tags((const char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) + { + 
evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) + { + if(element->properties == NULL) + { + evalStringCache[element] = TransferInstr(ti_b, " ", -1); + } + else + { + int pos = atoi((const char *) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, "", pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) + { + int pos = atoi((const char *) element->properties->children->content); + xmlNode *param = NULL; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + param = i; + break; + } + } + + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) + { + evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lu-count")) + { + evalStringCache[element] = TransferInstr(ti_lu_count, "", 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content); + } + } + + evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) + { + string value; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + value.append(evalString(i)); + } + } + return value; + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *i = element->children; i 
!= NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + myword.append(evalString(i)); + } + } + + if(myword != "") + { + return "^"+myword+"$"; + } + else + { + return ""; + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) + { + string value; + + bool first_time = true; + + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + string myword; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + + if(!first_time) + { + if(myword != "" && myword[0] != '#') //'+#' problem + { + value.append("+"); + } + } + else + { + if(myword != "") + { + first_time = false; + } + } + + value.append(myword); + } + } + + if(value != "") + { + return "^"+value+"$"; + } + else + { + return ""; + } + } + + else + { + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + + return evalString(element); +} + +void +Postchunk::processOut(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + if(myword != "") + { + fputwc_unlocked(L'^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + fputwc_unlocked(L'$', output); + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) + { + fputwc_unlocked(L'^', output); + bool first_time = true; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + string myword; + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + myword.append(evalString(k)); + } + } + + if(!first_time) + { + if(myword != "") + { + fputwc_unlocked('+', output); + 
} + } + else + { + if(myword != "") + { + first_time = false; + } + } + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + } + } + fputwc_unlocked(L'$', output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + } + } + } +} + +void +Postchunk::processTags(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(j)).c_str(), output); + } + } + } + } + } +} + +void +Postchunk::processInstruction(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } +} + +void +Postchunk::processLet(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + map::iterator it = evalStringCache.find(leftSide); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_var: + variables[ti.getContent()] = evalString(rightSide); + return; + + case ti_clip_tl: + 
word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); + return; + + default: + return; + } + } + if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = evalString(rightSide); + evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content); + } + } + + + word[pos]->setChunkPart(attr_items[(const char *) part], + evalString(rightSide)); + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, + pos, NULL); + } +} + +void +Postchunk::processAppend(xmlNode *localroot) +{ + string name; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "n")) + { + name = (char *) i->children->content; + break; + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + variables[name].append(evalString(i)); + } + } +} + +void +Postchunk::processModifyCase(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + 
pos = atoi((const char *) i->children->content); + } + } + + string const result = copycase(evalString(rightSide), + word[pos]->chunkPart(attr_items[(const char *) part])); + word[pos]->setChunkPart(attr_items[(const char *) part], result); + + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = copycase(evalString(rightSide), variables[val]); + } +} + +void +Postchunk::processCallMacro(xmlNode *localroot) +{ + const char *n = (const char *) localroot->properties->children->content; + int npar = 0; + + xmlNode *macro = macro_map[macros[n]]; + + for(xmlAttr *i = macro->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) + { + npar = atoi((const char *) i->children->content); + break; + } + } + + if (npar <= 0) + { + throw "Postchunk::processCallMacro() assumes npar > 0, but got npar <= 0"; + } + + InterchunkWord **myword = NULL; + if(npar > 0) + { + myword = new InterchunkWord *[npar+1]; + } + string **myblank = NULL; + if(npar > 0) + { + myblank = new string *[npar]; + } + + myword[0] = word[0]; + + int idx = 1; + int lastpos = 0; + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + int pos = atoi((const char *) i->properties->children->content); + if(!checkIndex(localroot, pos, lword)) { + pos=1; // for a rule to match, there has to be at least one word, so should be safe + } + myword[idx] = word[pos]; + if(blank) + { + myblank[idx-1] = blank[lastpos]; + } + + idx++; + lastpos = pos; + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + for(xmlNode *i = macro->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + delete[] myword; + delete[] myblank; +} + +void +Postchunk::processChoose(xmlNode *localroot) +{ 
+ for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "when")) + { + bool picked_option = false; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "test")) + { + if(!processTest(j)) + { + break; + } + else + { + picked_option = true; + } + } + else + { + processInstruction(j); + } + } + } + if(picked_option) + { + return; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + processInstruction(j); + } + } + } + } + } +} + +bool +Postchunk::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +Postchunk::processIn(xmlNode *localroot) +{ + 
xmlNode *value = NULL; + xmlChar *idlist = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(value == NULL) + { + value = i; + } + else + { + idlist = i->properties->children->content; + break; + } + } + } + + string sval = evalString(value); + + if(localroot->properties != NULL) + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + set &myset = listslow[(const char *) idlist]; + if(myset.find(tolower(sval)) != myset.end()) + { + return true; + } + else + { + return false; + } + } + } + + set &myset = lists[(const char *) idlist]; + if(myset.find(sval) != myset.end()) + { + return true; + } + else + { + return false; + } +} + +bool +Postchunk::processTest(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return processLogical(i); + } + } + return false; +} + +bool +Postchunk::processAnd(xmlNode *localroot) +{ + bool val = true; + for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val && processLogical(i); + } + } + + return val; +} + +bool +Postchunk::processOr(xmlNode *localroot) +{ + bool val = false; + for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val || processLogical(i); + } + } + + return val; +} + +bool +Postchunk::processNot(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return !processLogical(i); + } + } + return false; +} + +bool +Postchunk::processEqual(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == 
NULL) + { + return evalString(first) == evalString(second); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)) == tolower(evalString(second)); + } + else + { + return evalString(first) == evalString(second); + } + } +} + +bool +Postchunk::beginsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = 0; i != limit; i++) + { + if(s1[i] != s2[i]) + { + return false; + } + } + + return true; +} + +bool +Postchunk::endsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) + { + if(s1[j] != s2[i]) + { + return false; + } + } + + return true; +} + + +bool +Postchunk::processBeginsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return beginsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return beginsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return beginsWith(evalString(first), evalString(second)); + } + } +} + +bool +Postchunk::processEndsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return endsWith(evalString(first), evalString(second)); + } + else + { + 
if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return endsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return endsWith(evalString(first), evalString(second)); + } + } +} + +bool +Postchunk::processBeginsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(beginsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Postchunk::processEndsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(endsWith(needle, *it)) + { + return 
true; + } + } + return false; +} + + +bool +Postchunk::processContainsSubstring(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first).find(evalString(second)) != string::npos; + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + } + else + { + return evalString(first).find(evalString(second)) != string::npos; + } + } +} + +string +Postchunk::copycase(string const &source_word, string const &target_word) +{ + wstring result; + wstring const s_word = UtfConverter::fromUtf8(source_word); + wstring const t_word = UtfConverter::fromUtf8(target_word); + + bool firstupper = iswupper(s_word[0]); + bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); + bool sizeone = s_word.size() == 1; + + if(!uppercase || (sizeone && uppercase)) + { + result = StringUtils::tolower(t_word); + } + else + { + result = StringUtils::toupper(t_word); + } + + if(firstupper) + { + result[0] = towupper(result[0]); + } + + return UtfConverter::toUtf8(result); +} + +string +Postchunk::caseOf(string const &str) +{ + wstring const s = UtfConverter::fromUtf8(str); + + if(s.size() > 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else if(!iswupper(s[s.size()-1])) + { + return "Aa"; + } + else + { + return "AA"; + } + } + else if(s.size() == 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else + { + return "Aa"; + } + } + else + { + return "aa"; + } +} + +wstring +Postchunk::caseOf(wstring const &str) +{ + if(str.size() > 1) + { + if(!iswupper(str[0])) + { + return L"aa"; + } + else if(!iswupper(str[str.size()-1])) + { + return L"Aa"; + } + else + { + return L"AA"; + } + } 
+ else if(str.size() == 1) + { + if(!iswupper(str[0])) + { + return L"aa"; + } + else + { + return L"Aa"; + } + } + else + { + return L"aa"; + } +} + +string +Postchunk::tolower(string const &str) const +{ + return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); +} + +string +Postchunk::tags(string const &str) const +{ + string result = "<"; + + for(unsigned int i = 0, limit = str.size(); i != limit; i++) + { + if(str[i] == '.') + { + result.append("><"); + } + else + { + result += str[i]; + } + } + + result += '>'; + + return result; +} + +void +Postchunk::processRule(xmlNode *localroot) +{ + // localroot is suposed to be an 'action' tag + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } +} + +TransferToken & +Postchunk::readToken(FILE *in) +{ + if(!input_buffer.isEmpty()) + { + return input_buffer.next(); + } + + wstring content; + while(true) + { + int val = fgetwc_unlocked(in); + if(feof(in) || (internal_null_flush && val == 0)) + { + return input_buffer.add(TransferToken(content, tt_eof)); + } + if(val == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L'[') + { + content += L'['; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } + } + } + else if(inword && val == L'{') + { + content += L'{'; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L'}') + { + int val3 = wchar_t(fgetwc_unlocked(in)); + ungetwc(val3, in); + + content += L'}'; + if(val3 == L'$') + { + break; + } + } + else + { + content += wchar_t(val2); + } + } + } + else if(inword && val == L'$') + { + inword = false; + return 
input_buffer.add(TransferToken(content, tt_word)); + } + else if(val == L'^') + { + inword = true; + return input_buffer.add(TransferToken(content, tt_blank)); + } + else + { + content += wchar_t(val); + } + } +} + +bool +Postchunk::getNullFlush(void) +{ + return null_flush; +} + +void +Postchunk::setNullFlush(bool null_flush) +{ + this->null_flush = null_flush; +} + +void +Postchunk::postchunk_wrapper_null_flush(FILE *in, FILE *out) +{ + null_flush = false; + internal_null_flush = true; + + while(!feof(in)) + { + postchunk(in, out); + fputwc_unlocked(L'\0', out); + int code = fflush(out); + if(code != 0) + { + wcerr << L"Could not flush output " << errno << endl; + } + } + + internal_null_flush = false; + null_flush = true; +} + +void +Postchunk::postchunk(FILE *in, FILE *out) +{ + if(getNullFlush()) + { + postchunk_wrapper_null_flush(in, out); + } + + int last = 0; + + output = out; + ms.init(me->getInitial()); + + while(true) + { + if(ms.size() == 0) + { + if(lastrule != NULL) + { + applyRule(); + input_buffer.setPos(last); + } + else + { + if(tmpword.size() != 0) + { + unchunk(*tmpword[0], output); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) + { + fputws_unlocked(tmpblank[0]->c_str(), output); + tmpblank.clear(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + } + } + int val = ms.classifyFinals(me->getFinals()); + if(val != -1) + { + lastrule = rule_map[val-1]; + last = input_buffer.getPos(); + } + + TransferToken ¤t = readToken(in); + + switch(current.getType()) + { + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(¤t.getContent()); + break; + + case tt_blank: + ms.step(L' '); + tmpblank.push_back(¤t.getContent()); + break; + + case tt_eof: + if(tmpword.size() != 0) + { + tmpblank.push_back(¤t.getContent()); + ms.clear(); + } + else + { + fputws_unlocked(current.getContent().c_str(), output); + 
return; + } + break; + + default: + cerr << "Error: Unknown input token." << endl; + return; + } + } +} + +void +Postchunk::applyRule() +{ + wstring const chunk = *tmpword[0]; + tmpword.clear(); + splitWordsAndBlanks(chunk, tmpword, tmpblank); + + word = new InterchunkWord *[tmpword.size()+1]; + lword = tmpword.size(); + word[0] = new InterchunkWord(UtfConverter::toUtf8(wordzero(chunk))); + + for(unsigned int i = 1, limit = tmpword.size()+1; i != limit; i++) + { + if(i == 1) + { + if(limit != 2) + { + blank = new string *[limit - 2]; + lblank = limit - 3; + } + else + { + blank = NULL; + lblank = 0; + } + } + else + { + blank[i-2] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + } + + word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i-1])); + } + + processRule(lastrule); + lastrule = NULL; + + if(word) + { + for(unsigned int i = 0, limit = tmpword.size() + 1; i != limit; i++) + { + delete word[i]; + } + delete[] word; + } + if(blank) + { + for(unsigned int i = 0, limit = tmpword.size() - 1; i != limit; i++) + { + delete blank[i]; + } + delete[] blank; + } + word = NULL; + blank = NULL; + + for(unsigned int i = 0, limit = tmpword.size(); i != limit; i++) + { + if(i != 0) + { + delete tmpblank[i]; + } + delete tmpword[i]; + } + tmpword.clear(); + tmpblank.clear(); + ms.init(me->getInitial()); +} + +void +Postchunk::applyWord(wstring const &word_str) +{ + ms.step(L'^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) + { + switch(word_str[i]) + { + case L'\\': + i++; + ms.step(towlower(word_str[i]), any_char); + break; + + case L'<': +/* for(unsigned int j = i+1; j != limit; j++) + { + if(word_str[j] == '>') + { + int symbol = alphabet(word_str.substr(i, j-i+1)); + if(symbol) + { + ms.step(symbol, any_tag); + } + else + { + ms.step(any_tag); + } + i = j; + break; + } + } + break;*/ + + case L'{': // ignore the unmodifiable part of the chunk + ms.step(L'$'); + return; + + default: + ms.step(towlower(word_str[i]), any_char); + break; + 
} + } + ms.step(L'$'); +} + +vector +Postchunk::getVecTags(wstring const &chunk) +{ + vector vectags; + + for(int i = 0, limit = chunk.size(); i != limit; i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'<') + { + wstring mytag; + do + { + mytag += chunk[i++]; + } + while(chunk[i] != L'>'); + vectags.push_back(mytag + L'>'); + } + else if(chunk[i] == L'{') + { + break; + } + } + return vectags; +} + +int +Postchunk::beginChunk(wstring const &chunk) +{ + for(int i = 0, limit = chunk.size(); i != limit; i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'{') + { + return i + 1; + } + } + return chunk.size(); +} + +int +Postchunk::endChunk(wstring const &chunk) +{ + return chunk.size()-2; +} + +wstring +Postchunk::wordzero(wstring const &chunk) +{ + for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'{') + { + return chunk.substr(0, i); + } + } + + return L""; +} + +wstring +Postchunk::pseudolemma(wstring const &chunk) +{ + for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'<' || chunk[i] == L'{') + { + return chunk.substr(0, i); + } + } + + return L""; +} + +void +Postchunk::unchunk(wstring const &chunk, FILE *output) +{ + vector vectags = getVecTags(chunk); + wstring case_info = caseOf(pseudolemma(chunk)); + bool uppercase_all = false; + bool uppercase_first = false; + + if(case_info == L"AA") + { + uppercase_all = true; + } + else if(case_info == L"Aa") + { + uppercase_first = true; + } + + for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++) + { + if(chunk[i] == L'\\') + { + fputwc_unlocked(L'\\', output); + fputwc_unlocked(chunk[++i], output); + } + else if(chunk[i] == L'^') + { + fputwc_unlocked(L'^', output); + while(chunk[++i] != L'$') + { + if(chunk[i] == L'\\') + { + fputwc_unlocked(L'\\', output); + fputwc_unlocked(chunk[++i], output); + } + 
else if(chunk[i] == L'<') + { + if(iswdigit(chunk[i+1])) + { + // replace tag + unsigned long value = wcstoul(chunk.c_str()+i+1, + NULL, 0) - 1; + //atoi(chunk.c_str()+i+1)-1; + if(vectags.size() > value) + { + fputws_unlocked(vectags[value].c_str(), output); + } + while(chunk[++i] != L'>'); + } + else + { + fputwc_unlocked(L'<', output); + while(chunk[++i] != L'>') fputwc_unlocked(chunk[i], output); + fputwc_unlocked(L'>', output); + } + } + else + { + if(uppercase_all) + { + fputwc_unlocked(towupper(chunk[i]), output); + } + else if(uppercase_first) + { + if(iswalnum(chunk[i])) + { + fputwc_unlocked(towupper(chunk[i]), output); + uppercase_first = false; + } + else + { + fputwc_unlocked(chunk[i], output); + } + } + else + { + fputwc_unlocked(chunk[i], output); + } + } + } + fputwc_unlocked(L'$', output); + } + else if(chunk[i] == L'[') + { + fputwc_unlocked(L'[', output); + while(chunk[++i] != L']') + { + if(chunk[i] == L'\\') + { + fputwc_unlocked(L'\\', output); + fputwc_unlocked(chunk[++i], output); + } + else + { + fputwc_unlocked(chunk[i], output); + } + } + fputwc_unlocked(L']', output); + } + else + { + fputwc_unlocked(chunk[i], output); + } + } +} + + +void +Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, + vector &blanks) +{ + vector vectags = getVecTags(chunk); + wstring case_info = caseOf(pseudolemma(chunk)); + bool uppercase_all = false; + bool uppercase_first = false; + bool lastblank = true; + + if(case_info == L"AA") + { + uppercase_all = true; + } + else if(case_info == L"Aa") + { + uppercase_first = true; + } + + for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++) + { + if(chunk[i] == L'^') + { + if(!lastblank) + { + blanks.push_back(new wstring(L"")); + } + lastblank = false; + wstring *myword = new wstring(); + wstring &ref = *myword; + + while(chunk[++i] != L'$') + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else if(chunk[i] == L'<') + { + if(iswdigit(chunk[i+1])) + { + // 
replace tag + unsigned long value = wcstoul(chunk.c_str()+i+1, + NULL, 0) - 1; + if(vectags.size() > value) + { + ref.append(vectags[value]); + } + while(chunk[++i] != L'>'); + } + else + { + ref += L'<'; + while(chunk[++i] != L'>') ref += chunk[i]; + ref += L'>'; + } + } + else + { + if(uppercase_all) + { + ref += towupper(chunk[i]); + } + else if(uppercase_first) + { + if(iswalnum(chunk[i])) + { + ref += towupper(chunk[i]); + uppercase_first = false; + } + else + { + ref += chunk[i]; + } + } + else + { + ref += chunk[i]; + } + } + } + + words.push_back(myword); + } + else if(chunk[i] == L'[') + { + if (!(lastblank && blanks.back())) + { + blanks.push_back(new wstring()); + } + wstring &ref = *(blanks.back()); + ref += L'['; + while(chunk[++i] != L']') + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else + { + ref += chunk[i]; + } + } + ref += chunk[i]; + + lastblank = true; + } + else + { + if (!lastblank) + { + wstring *myblank = new wstring(L""); + blanks.push_back(myblank); + } + wstring &ref = *(blanks.back()); + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else + { + ref += chunk[i]; + } + lastblank = true; + } + } +} + Index: branches/apertium-tagger/apertium2/apertium/unlocked_cstdio.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/unlocked_cstdio.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/unlocked_cstdio.h (revision 69632) @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _APERTIUM_UNLOCKED_CSTDIO_ +#define _APERTIUM_UNLOCKED_CSTDIO_ + +#include + +#if !HAVE_DECL_FPUTS_UNLOCKED +#define fputs_unlocked fputs +#endif + +#if !HAVE_DECL_FGETC_UNLOCKED +#define fgetc_unlocked fgetc +#endif + +#if !HAVE_DECL_FPUTC_UNLOCKED +#define fputc_unlocked fputc +#endif + +#if !HAVE_DECL_FWRITE_UNLOCKED +#define fwrite_unlocked fwrite +#endif + +#if !HAVE_DECL_FREAD_UNLOCKED +#define fread_unlocked fread +#endif + +#if !HAVE_DECL_FGETWC_UNLOCKED +#define fgetwc_unlocked fgetwc +#endif + +#if !HAVE_DECL_FPUTWC_UNLOCKED +#define fputwc_unlocked fputwc +#endif + +#if !HAVE_DECL_FPUTWS_UNLOCKED +#define fputws_unlocked fputws +#endif + +#if !HAVE_MBTOWC +#include +inline int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); } +inline int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); } +#endif + +#endif Index: branches/apertium-tagger/apertium2/apertium/lextor.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lextor.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lextor.cc (revision 69632) @@ -0,0 +1,1045 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include + +#include +#include +#include +#include + +using namespace Apertium; + + +#define PI 3.14159265358979323846264338327950288 + +bool LexTor::debug; +double LexTor::angleth; + +LexTor::LexTor() : +fstpbil(0) +{ + lextor_data=NULL; + tlmodel=NULL; +} + +LexTor::LexTor(const LexTor& lt) : +fstpbil(0) +{ + lextor_data=lt.lextor_data; + tlmodel=lt.tlmodel; +} + +LexTor::~LexTor() { +} + +void +LexTor::set_lextor_data(LexTorData* ltd) { + lextor_data=ltd; +} + +void +LexTor::set_tlmodel(LexTorData* tlm) { + tlmodel=tlm; +} + +void +LexTor::set_bildic(FSTProcessor *fstp) { + fstpbil=fstp; +} + +void +LexTor::trainwrd(wistream& is, int left, int right, double weigth_exponent) { + if (lextor_data==NULL) { + wcerr<ensure_stopwords_ok(); + + wcerr< words2workwith=lextor_data->get_words(); + set::iterator itword; + + map wordsum; + + wcerr< > context; + deque buffer; + unsigned word_index=(unsigned)left; + + unsigned buffer_max_size=(unsigned)(left+1+right); + + LexTorWord *ltword; + ltword=LexTorWord::next_word(is); + while(ltword!=NULL) { + if ((++nw%250000)==0) + wcerr<get_word_string() + <reduce(ltword->get_word_string())<reduce(ltword->get_word_string()); + + if (!lextor_data->is_stopword(reduced_word)) { + if (buffer.size()>=buffer_max_size) { + buffer.pop_front(); + } + buffer.push_back(reduced_word); + + wordsum[reduced_word]+=1.0; + + //The buffer is already full + if (buffer.size()==buffer_max_size) { + for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) { + if (buffer[word_index]==(*itword)) { + if(debug) { + wcerr<>>>"<::iterator itws; + 
for(itws=wordsum.begin(); itws!=wordsum.end(); itws++) { + lextor_data->set_wordcount(itws->first,itws->second); + //if(debug) { + wcerr<first<second< > context_v; + map::iterator itm; + + while(context[*itword].size()>0) { + itm=context[*itword].begin(); + context_v.push_back(*itm); + context[*itword].erase(itm); + } + + sort(context_v.begin(), context_v.end(), comparer); + wstring w=*itword; + lextor_data->set_cooccurrence_context(w, context_v); + lextor_data->set_lexchoice_sum(w, wordsum[w]); + + //if (debug) { + wcerr<ensure_stopwords_ok(); + + wcerr< words2workwith=lextor_data->get_words(); + set::iterator itword; + + map wordsum; + map lechsum; + + wcerr< lexchoice_translation; + map > lexical_choices_of_word; + + wcerr< lexical_choices=lextor_data->get_lexical_choices(*itword); + lexical_choices_of_word[*itword]=lexical_choices; + set::iterator itlch; + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + lexchoice_translation[*itlch]=tlwordmodel.reduce(bildic.biltrans(*itlch,false)); + wcerr<<*itlch< > context; + deque buffer; + + int word_index=left; + unsigned buffer_max_size=left+right+1; + + LexTorWord *ltword; + ltword=LexTorWord::next_word(is,&dic); + while(ltword!=NULL) { + if (debug) { + wcerr<get_word_string()<reduce(ltword->get_word_string()); + getchar(); + } + if ((++nw%250000)==0) + wcerr<reduce(ltword->get_word_string()); + + if (!lextor_data->is_stopword(reduced_word)) { + if (buffer.size()>=buffer_max_size) { + buffer.pop_front(); + } + buffer.push_back(*ltword); + + wordsum[reduced_word]+=1.0; + + //The buffer is already full + if (buffer.size()==buffer_max_size) { + + wstring reduced_buffer_word=lextor_data->reduce(buffer[word_index].get_word_string()); + + for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) { + if (reduced_buffer_word==(*itword)) { + //We translate each word in the context + //Note: Words in the context can also be ambiguous (with more than one lexical choice) + //In that 
case the count will come from all the possible + //translations + vector > translation_buffer(buffer_max_size); + vector reduced_buffer(buffer_max_size); + + for (int i=0; i<(int)buffer_max_size; i++) { + reduced_buffer[i]=lextor_data->reduce(buffer[i].get_word_string()); + } + + if(debug) { + wcerr<>>>"<0) { + wstring tr=tlwordmodel.reduce(aux_tr); + translation_buffer[i].push_back(tr); + str_translations+=tr+L"/"; + } else { + wcerr<>>>"< lexical_choices=lexical_choices_of_word[*itword]; + set::iterator itlch; + + map > local_context; + map sumvotes_context; + + //For each lexical choice the counts from the TL are collected + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + for (int i=0; i<(int)buffer_max_size; i++) { + if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) { + COUNT_DATA_TYPE target_vote=0; + + //The counts of the TL co-occurrence model are transferred to the SL. If the SL word is ambiguous + //it will have more than one translation into TL, so we need to normalize using the frequency of words + //in the TL + vector translation_weighs(translation_buffer[i].size()); + double sum=0.0; + if (translation_buffer[i].size()>1) { + for(int j=0; j<(int)translation_buffer[i].size(); j++) { + translation_weighs[j]=tlwordmodel.get_lexchoice_sum(translation_buffer[i][j]); + sum+=translation_weighs[j]; + + //!!!!! Para hacer que no tenga en cuenta las polisemicas del contexto + ///////translation_weighs[j]=0; + //!!!!! 
+ + if (debug) { + wcerr<0) { + aux_vote=(tlwordmodel.vote_from_word(lexchoice_translation[*itlch],translation_buffer[i][j])/ + tlwordmodel.get_wordcount(lexchoice_translation[*itlch]))*translation_weighs[j]; + if (debug) { + wcerr<0) { + wcerr<0) { + local_context[*itlch][reduced_buffer[i]]+=target_vote; + sumvotes_context[reduced_buffer[i]]+=target_vote; + } + } + } + } + + if (debug) { + wcerr< local_lexsum; + double local_lexsumsum=0.0; + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + int distance=(-1)*left; + for (int i=0; i<(int)buffer_max_size; i++) { + if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) { + if (local_context[*itlch][reduced_buffer[i]]>0) { + double cc=local_context[*itlch][reduced_buffer[i]]/sumvotes_context[reduced_buffer[i]]; + double count_to_apply=cc/pow(fabs((double)distance),weigth_exponent); + context[*itlch][reduced_buffer[i]]+=count_to_apply; + if (debug) { + wcerr<0) && (local_lexsumsum>0)) + lechsum[*itlch]+=local_lexsum[*itlch]/local_lexsumsum; + if (debug) { + wcerr<::iterator itws; + for(itws=wordsum.begin(); itws!=wordsum.end(); itws++) { + lextor_data->set_wordcount(itws->first,itws->second); + //if(debug) { + wcerr<first<second< lexical_choices=lexical_choices_of_word[*itword]; + set::iterator itlch; + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + PairStringCountComparer comparer; + vector > context_v; + map::iterator itm; + + while(context[*itlch].size()>0) { + itm=context[*itlch].begin(); + //wcerr<first<second<set_cooccurrence_context(lch, context_v); + //lextor_data->set_lexchoice_sum(lch, tlwordmodel.get_lexchoice_sum(lexchoice_translation[lch])); + + //wcerr<::iterator itlcs; + for(itlcs=lechsum.begin(); itlcs!=lechsum.end(); itlcs++) { + lextor_data->set_lexchoice_sum(itlcs->first,itlcs->second); + //if(debug) { + wcerr<first<second< buffer; + deque window; + + LexTorWord nullword(L"NULLWORD", &fstp); + + for(int i=0; i<(left+right+1); i++) + 
window.push_back(nullword); + + int retain=0; + + LexTorWord* ltword=NULL; + ltword=LexTorWord::next_word(is, &fstp); + + while(ltword) { + //wcerr<get_word_string() + //<reduce(ltword->get_word_string())<n_lexical_choices()<is_stopword(lextor_data->reduce(ltword->get_word_string()))) { + if (window.size()>=(unsigned)(left+1+right)) + window.pop_front(); + + window.push_back(*ltword); + + if (ltword->n_lexical_choices()>1) { + retain++; + if (retain>1) + buffer.push_back(*ltword); + } else { + if (retain>0) + buffer.push_back(*ltword); + else { + wcout<get_lexical_choice(-1,true); + if (lteval) + lteval->evalword(*ltword, -1, lextor_data); + } + } + + if (window[left].n_lexical_choices()>1) { + + if (debug) { + wcerr<>>>"<evalword(window[left], winner, lextor_data); + + //For debug + /* + cout<0) + cout<0) { + while ((buffer.size()>0)&&(buffer[0].n_lexical_choices()==1)) { + wcout<evalword(buffer[0], -1, lextor_data); + buffer.pop_front(); + } + if ((buffer.size()>0)&&(buffer[0].n_lexical_choices()>1)) + buffer.pop_front(); + + retain--; + } + } + } else { //It's a stopword + if (retain>0) + buffer.push_back(*ltword); + else { + wcout<get_lexical_choice(-1,true); + if (lteval) + lteval->evalword(*ltword, -1, lextor_data); + } + } + + delete ltword; + ltword=LexTorWord::next_word(is, &fstp); + } + + if (retain>0) { + for(unsigned i=left+1; i1) { + int winner=estimate_winner_lch(window, i, weigth_exponent); + + wcout<evalword(window[i], winner, lextor_data); + + //For debug + /* + cout<0) + cout<0) { + while ((buffer.size()>0)&&(buffer[0].n_lexical_choices()==1)) { + wcout<evalword(buffer[0], -1, lextor_data); + buffer.pop_front(); + } + if ((buffer.size()>0)&&(buffer[0].n_lexical_choices()>1)) + buffer.pop_front(); + + retain--; + } + + } + } + } + + //wcerr<& window, int word_index, double weigth_exponent) { + //return estimate_winner_lch_cosine(window, word_index, weigth_exponent); + return estimate_winner_lch_voting(window, word_index, weigth_exponent); + 
//return estimate_winner_lch_mostprob(window, word_index, weigth_exponent); + //return estimate_winner_lch_votingtl(window, word_index, weigth_exponent); + //return -1; +} + +int +LexTor::estimate_winner_lch_voting(deque& window, int word_index, double weigth_exponent) { + vector lexchoices_count(window[word_index].n_lexical_choices()); + + if (debug) { + wcerr<>>>"<reduce(window[i].get_word_string())<reduce(window[i].get_word_string())<get_lexchoice_sum(lextor_data->reduce_lexical_choice(window[word_index].get_lexical_choice(i,false))); + sum_lexchoices+=aux_lexchoice_sum; + if (debug) { + wcerr<reduce_lexical_choice(window[word_index].get_lexical_choice(i,false))<get_wordcount(lextor_data->reduce(window[word_index].get_word_string())); + if (debug) { + wcerr<reduce(window[word_index].get_word_string())<reduce_lexical_choice(window[word_index].get_lexical_choice(i,false)); + if (debug) { + wcerr<reduce(window[j].get_word_string()); + + if (lextor_data->get_wordcount(reduced_word)>0) { + vote=lextor_data->vote_from_word(reduced_lexchoice, reduced_word)/ + (((lextor_data->get_lexchoice_sum(reduced_lexchoice))/sum_lexchoices)*wordcount); + + lexchoices_count[i]+=vote/pow(fabs((double)distance),weigth_exponent); + } + + if (debug) { + wcerr<vote_from_word(reduced_lexchoice, reduced_word)<get_wordcount(reduced_word)<0) && (lexchoices_count[i]>winner_vote)) { + winner_vote=lexchoices_count[i]; + winner=i; + } + /* + else if ((lexchoices_count[i]>0) && (lexchoices_count[i]==winner_vote)) { + //Take the most probable one, the one with the highest sum + COUNT_DATA_TYPE sum_i=lextor_data->get_lexchoice_sum(lextor_data->reduce(window[word_index].get_lexical_choice(i))); + COUNT_DATA_TYPE sum_win=lextor_data->get_lexchoice_sum(lextor_data->reduce(window[word_index].get_lexical_choice(winner))); + if (sum_i>sum_win) + winner=i; + } + */ + } + + if (debug) { + wcerr<& window, int word_index, double weigth_exponent) { + int winner=-1; + double greatest_sum=-1; + for(int i=0; 
ireduce_lexical_choice(window[word_index].get_lexical_choice(i,false)); + double sumlch=lextor_data->get_lexchoice_sum(reduced_lexchoice); + + + if (debug) { + wcerr<greatest_sum) { + greatest_sum=sumlch; + winner=i; + } + } + + if (greatest_sum==0) + winner=-1; + + if (debug) + wcerr<& window, int word_index, double weigth_exponent) { + map vcontext; + + int distance=(-1)*(word_index); + for(int i=0; i<(int)window.size(); i++) { + if (i!=word_index) { + wstring reduced_word=lextor_data->reduce(window[i].get_word_string()); + vcontext[reduced_word]+=1.0/pow(fabs((double)distance),weigth_exponent); + } + distance++; + } + + if (debug) { + wcerr<::iterator it; + for(it=vcontext.begin(); it!=vcontext.end(); it++) + wcerr<first<second<reduce_lexical_choice(window[word_index].get_lexical_choice(i,false)); + + double aux_cosine=cosine(vcontext, reduced_lexchoice); + double aux_angle=(acos(aux_cosine)*180)/PI; + if (debug) { + wcerr<reduce(window[word_index].get_word_string())<max_cosine) { + diff_angle=abs(min_angle-aux_angle); + winner=i; + max_cosine=aux_cosine; + min_angle=aux_angle; + } + */ + } + + if (debug) { + wcerr<& window, int word_index, double weigth_exponent) { + if (tlmodel==NULL) { + wcerr< lexchoices_count(window[word_index].n_lexical_choices()); + vector > translation_window (window.size()); + vector reduced_window(window.size()); + + for (unsigned i=0; ireduce(window[i].get_word_string()); + + if(debug) { + wcerr<>>>"<reduce(window[i].translate(*fstpbil,j)); + translation_window[i].push_back(tr); + str_translations+=tr+L"/"; + } + if (debug) { + if (i==(unsigned)word_index) + wcerr<>>>"< translation_weighs(translation_window[k].size()); + double sum=0.0; + if (translation_window[k].size()>1) { + for(unsigned j=0; jget_lexchoice_sum(translation_window[k][j]); + sum+=translation_weighs[j]; + + //!!!!! Para hacer que no tenga en cuenta las + //!!!!! polisemicas del contexto + ///////translation_weighs[j]=0; + //!!!!! + //!!!!! 
+ + if (debug) { + wcerr<vote_from_word(translation_window[word_index][i],translation_window[k][j])<get_wordcount(translation_window[k][j])<get_wordcount(translation_window[k][j])>0) { + aux_vote=(tlmodel->vote_from_word(translation_window[word_index][i],translation_window[k][j])/ + tlmodel->get_wordcount(translation_window[k][j]))*translation_weighs[j]; + } + target_vote+=aux_vote; + + if(debug) { + wcerr<0) && (lexchoices_count[i]>winner_vote)) { + winner_vote=lexchoices_count[i]; + winner=i; + } + } + + if (debug) + wcerr<& vcontext, const wstring& reduced_lexchoice) { + map::iterator itc; + + //We calculate the scalar product between vcontext and the lexchoice vector + double scalar_product=0; + for(itc=vcontext.begin(); itc!=vcontext.end(); itc++) { + scalar_product+=(itc->second)*(lextor_data->vote_from_word(reduced_lexchoice, itc->first)); + } + + //We calculate the module of vcontext, ||vcontext|| + double module_vcontext=0; + for(itc=vcontext.begin(); itc!=vcontext.end(); itc++) { + module_vcontext+=(itc->second)*(itc->second); + } + module_vcontext=sqrt(module_vcontext); + + //We get the module of the lexchoice vector, ||lexchoice vector|| + double module_lexchoice_vector=lextor_data->get_module_lexchoice_vector(reduced_lexchoice); + + if (module_vcontext==0) { + wcerr<::iterator it; + for(it=vcontext.begin(); it!=vcontext.end(); it++) + wcerr<first<second< +#include +#include +#include +#include +#include + +extern "C" { +#if !defined(__STDC__) +# define __STDC__ 1 +#endif +#include +} + +#include +#include +#include +#ifndef GENFORMAT +#include "apertium_config.h" +#endif +#include +#ifdef _WIN32 +#include +#include +#endif + +using namespace std; + +AccentsMap accentsMap(false); +wstring closesym = L""; +string memconv = ""; +//For german babel detection +bool ngermanbabel = false; + +wstring convertir(string const &multibyte, int const length) +{ + memconv.append(multibyte.c_str(), length); + int tam = memconv.size(); + wchar_t *retval = new 
wchar_t[tam+1]; + size_t l = mbstowcs(retval, memconv.c_str(), tam); + + if(l == ((size_t) -1)) + { + delete[] retval; + if(memconv.size() >= 4) + { + wcerr << L"Warning: wrong encoding" << endl; + } + return L""; + } + else + { + memconv = ""; + retval[l] = 0; + wstring ret = retval; + delete[] retval; + return ret; + } +} + + + + +%} + + +%option nounput +%option noyywrap +%option stack + +%x mathenv +%x readbrackets + +%% + + + + + +\\t\{..\} { //This information is lost + fputws(convertir(yytext+3,yyleng-4).c_str(),yyout); +} +\\l { + fputws(L"Å‚", yyout); +} + +\"[oOaAuUsS] { //When usepackage[ngerman]{babel} is present (not checked). + if(!ngermanbabel) + fputws(convertir(yytext,yyleng).c_str(),yyout); + else { + switch(yytext[1]){ + case 'o': fputws(L"ö", yyout); break; + case 'O': fputws(L"Ö", yyout); break; + case 'a': fputws(L"ä", yyout); break; + case 'A': fputws(L"Ä", yyout); break; + case 'u': fputws(L"ü", yyout); break; + case 'U': fputws(L"Ü", yyout); break; + case 's': fputws(L"ß", yyout); break; + case 'S': fputws(L"ß", yyout); break; + } + } +} + + + +\\[\^\"\'`]((\{\\[ij]\})|(\\[ij])) { + switch(yytext[1]){ + case '^': + if(yytext[4]=='i') + fputws(L"î", yyout); + else + fputws(L"ĵ",yyout); + break; + case '\"': + if(yytext[4]=='i') + fputws(L"ï",yyout); + else + fputws(L"j",yyout); //should actually be j with umlaut + break; + case '\'': + if(yytext[4]=='i') + fputws(L"í",yyout); + else + fputws(L"j",yyout); //should actually be j with accent + break; + case '`': + if(yytext[4]=='i') + fputws(L"ì",yyout); + else + fputws(L"k",yyout); //should actually be j with accent + break; + } +} + +\{\\oe\} { + fputws(L"Å“",yyout); +} + +\{\\OE\} { + fputws(L"Å’",yyout); +} + +\{\\ae\} { + fputws(L"æ",yyout); +} + +\{\\AE\} { + fputws(L"Æ",yyout); +} + +\{\\aa\} { + fputws(L"Ã¥",yyout); +} + +\{\\AA\} { + fputws(L"Ã…",yyout); +} + +\{\\o\} { + fputws(L"ø",yyout); +} + +\{\\O\} { + fputws(L"Ø",yyout); +} + +\{\\ss\} { + fputws(L"ß",yyout); +} + +\\#[0-9]+ { 
+ fputws((wstring(L"")).c_str(),yyout); +} + +\\# { + fputws(L"", yyout); +} + +\\[`'\^\"H~ck=b.druv]((\{.\})|(.)) { + wstring ws = convertir(yytext,yyleng).c_str(); + + wstring result = accentsMap.get( + L""+ws.substr(1,1)+ ( + (yyleng==3)? ws.substr(2,1) : ws.substr(3,1) + )); + + if(result == L"") + { + fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); + } + else + { + fputws(result.c_str(), yyout); + } +} + +\\\\ { + fputws(L"
",yyout); +} + +\%.* { + if(yytext[yyleng-1]=='\r') + fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"\r")).c_str(),yyout); + else + fputws((wstring(L"")+convertir(yytext+1,yyleng-1)+wstring(L"")).c_str(),yyout); +} + +\\usepackage\[[^\]]*\] { + wstring ws = convertir(yytext+12,yyleng-13); + fputws((wstring(L"")+ws+wstring(L"")).c_str(), yyout); + if(ws.find(L"ngerman") != wstring::npos) + ngermanbabel = true; +} + +\[[^\]]*\] { + fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"")).c_str(), yyout); +} + +\\begin[^a-zA-Z0-9_] { + BEGIN(readbrackets); + closesym = L""; +} + +\\end[^a-zA-Z0-9_] { + BEGIN(readbrackets); + closesym = L"/"; +} + + + +[ \n\r\t]*\{?[ \n\r\t]* { + wstring ws = convertir(yytext,yyleng); + int i = ws.find(L'{'); //remove it + if(i>=0) + ws = ws.substr(0,i)+ws.substr(i+1); + fputws(ws.c_str(),yyout); +} + +[a-zA-Z0-9]+\* { + fputws((wstring(L"<")+closesym+convertir(yytext,yyleng-1)+wstring(L"_STAR>")).c_str(),yyout); +} + +[a-zA-Z0-9]+ { + fputws((wstring(L"<")+closesym+convertir(yytext,yyleng)+wstring(L">")).c_str(),yyout); +} + +[ \n\r\t]*\}[ \n\r\t]* { + BEGIN(0); + wstring ws = convertir(yytext,yyleng); + int i = ws.find(L'}'); //remove it + if(i>=0) + ws = ws.substr(0,i)+ws.substr(i+1); + fputws(ws.c_str(),yyout); +} + + +\\[A-Za-z]+\* { + fputws((wstring(L"<")+convertir(yytext+1,yyleng-2)+wstring(L"_STAR/>")).c_str(),yyout); +} + +\\[A-Za-z]+ { + fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); +} + +\\\{ { + fputws(L"", yyout); + } + +\\\{ { + fputws(L"", yyout); + } + +\\\% { + fputws(L"", yyout); + } + +\{ { + fputws(L"",yyout); +} + +\} { + fputws((wstring(L"")).c_str(),yyout); +} + +~ { + fputws(L"&NBSP;",yyout); +} + +\$\$ { + BEGIN(mathenv); + fputws(L"",yyout); +} + +\$\$ { + fputws(L"",yyout); + BEGIN(0); +} + +\$ { + BEGIN(mathenv); + fputws(L"",yyout); +} + +\$ { + fputws(L"",yyout); + BEGIN(0); +} + +\\verb[|][^|]+[|] { + fputws(L"",yyout); + wstring ws = 
convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[!][^!]+[!] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[?][^?]+[?] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[/][^/]+[/] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[#][^#]+[#] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[+][^+]+[+] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\\( { + fputws(L"",yyout); +} + +\\\) { + fputws(L"",yyout); +} + +\\\[ { + fputws(L"",yyout); +} + +\\\] { + fputws(L"",yyout); +} + +\?` { + fputws(L"¿",yyout); +} + +!` { + fputws(L"¡",yyout); +} + +\" { + fputws(L""",yyout); +} +\' { + fputws(L"'",yyout); +} +\< { + fputws(L"<",yyout); +} +\> { + fputws(L">",yyout); +} +\\\& { + fputws(L"&",yyout); +} +\& { + fputws(L"",yyout); +} + + + + + +(.|\n|\r) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + +(.|\n) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + + +<> { + return 0; +} +%% + + + +void usage(string const &progname) +{ + + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + + cerr << "LaTeX format preprocessor " << endl; + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + size_t base = 0; + + if(argc >= 2 && !strcmp(argv[1],"-i")) + { + base++; + } + + if((argc-base) > 4) + { + usage(argv[0]); + } + + switch(argc-base) + { + case 3: + yyout = fopen(argv[2+base], "w"); + if(!yyout) + { + usage(argv[0]); + } + case 
2: + yyin = fopen(argv[1+base], "r"); + if(!yyin) + { + usage(argv[0]); + } + break; + default: + break; + } + +#ifdef _WIN32 + _setmode(_fileno(yyin), _O_U8TEXT); + _setmode(_fileno(yyout), _O_U8TEXT); +#endif + // prevent warning message + yy_push_state(1); + yy_top_state(); + yy_pop_state(); + + yylex(); + + fclose(yyin); + fclose(yyout); +} Index: branches/apertium-tagger/apertium2/apertium/apertium_re.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_re.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_re.cc (revision 69632) @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +ApertiumRE::ApertiumRE() : +re(0) +{ + empty = true; +} + +ApertiumRE::~ApertiumRE() +{ + if(!empty) + { + pcre_free(re); + } + empty = true; +} + +void +ApertiumRE::read(FILE *input) +{ + unsigned int size = Compression::multibyte_read(input); + re = static_cast(pcre_malloc(size)); + if(size != fread(re, 1, size, input)) + { + wcerr << L"Error reading regexp" << endl; + exit(EXIT_FAILURE); + } + + empty = false; +} + +void +ApertiumRE::compile(string const &str) +{ + const char *error; + int erroroffset; + re = pcre_compile(str.c_str(), PCRE_DOTALL|PCRE_CASELESS|PCRE_EXTENDED|PCRE_UTF8, + &error, &erroroffset, NULL); + if(re == NULL) + { + wcerr << L"Error: pcre_compile "; + cerr << error << endl; + exit(EXIT_FAILURE); + } + + empty = false; +} + +void +ApertiumRE::write(FILE *output) const +{ + if(empty) + { + cerr << L"Error, cannot write empty regexp" << endl; + exit(EXIT_FAILURE); + } + + size_t size; + int rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size); + if(rc < 0) + { + wcerr << L"Error calling pcre_fullinfo()\n" << endl; + exit(EXIT_FAILURE); + } + + Compression::multibyte_write(size, output); + + size_t rc2 = fwrite(re, 1, size, output); + if(rc2 != size) + { + wcerr << L"Error writing precompiled regex\n" << endl; + exit(EXIT_FAILURE); + } +} + +string +ApertiumRE::match(string const &str) const +{ + if(empty) + { + return ""; + } + + int result[3]; + int workspace[4096]; +// int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); + int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); + + if(rc < 0) + { + switch(rc) + { + case PCRE_ERROR_NOMATCH: + return ""; + + default: + wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; + exit(EXIT_FAILURE); + } + } + + return str.substr(result[0], result[1]-result[0]); +} + +void 
+ApertiumRE::replace(string &str, string const &value) const +{ + if(empty) + { + return; + } + + int result[3]; + int workspace[4096]; + // int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); + int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); + if(rc < 0) + { + switch(rc) + { + case PCRE_ERROR_NOMATCH: + return; + + default: + wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; + exit(EXIT_FAILURE); + } + } + + string res = str.substr(0, result[0]); + res.append(value); + res.append(str.substr(result[1])); + str = res; +} Index: branches/apertium-tagger/apertium2/apertium/interchunk.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.cc (revision 69632) @@ -0,0 +1,1603 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; +using namespace std; + +void +Interchunk::destroy() +{ + delete me; + me = NULL; + + if(doc) + { + xmlFreeDoc(doc); + doc = NULL; + } +} + +Interchunk::Interchunk() : +word(0), +blank(0), +lword(0), +lblank(0), +output(0), +any_char(0), +any_tag(0), +nwords(0) +{ + me = NULL; + doc = NULL; + root_element = NULL; + lastrule = NULL; + inword = false; + null_flush = false; + internal_null_flush = false; + trace = false; + emptyblank = ""; +} + +Interchunk::~Interchunk() +{ + destroy(); +} + +void +Interchunk::readData(FILE *in) +{ + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + attr_items[cad_k].read(in); + wstring fallback = Compression::wstring_read(in); + if(recompile_attrs) { + attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + } + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + macros[cad_k] = 
Compression::multibyte_read(in); + } + + // lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +Interchunk::read(string const &transferfile, string const &datafile) +{ + readInterchunk(transferfile); + + // datafile + FILE *in = fopen(datafile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + +} + +void +Interchunk::readInterchunk(string const &in) +{ + doc = xmlReadFile(in.c_str(), NULL, 0); + + if(doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." << endl; + exit(EXIT_FAILURE); + } + + root_element = xmlDocGetRootElement(doc); + + // search for macros & rules + for(xmlNode *i = root_element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) + { + collectMacros(i); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) + { + collectRules(i); + } + } + } +} + +void +Interchunk::collectRules(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + for(xmlNode *j = i->children; ; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) + { + rule_map.push_back(j); + break; + } + } + } + } +} + +void +Interchunk::collectMacros(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + macro_map.push_back(i); + } + } +} + +bool +Interchunk::checkIndex(xmlNode 
*element, int index, int limit) +{ + if(index >= limit) + { + wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) <line << endl; + return false; + } + return true; +} + + +string +Interchunk::evalString(xmlNode *element) +{ + if (element == 0) + { + throw "Interchunk::evalString() was passed a NULL element"; + } + + map::iterator it; + it = evalStringCache.find(element); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + if(ti.getContent() == "content") // jacob's new 'part' + { + string wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + return wf.substr(1, wf.length()-2); // trim away the { and } + } + else + { + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + } + } + break; + + case ti_var: + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(checkIndex(element, ti.getPos(), lblank)) + { + if(ti.getPos() >= 0) + { + return !blank?"":*(blank[ti.getPos()]); + } + return " "; + } + break; + + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) + { + return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); + } + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); + } + break; + + default: + return ""; + } + return ""; + } + + if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *)i->children->content) - 1; + } + } + + evalStringCache[element] = TransferInstr(ti_clip_tl, 
(const char *) part, pos, NULL); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) + { + evalStringCache[element] = TransferInstr(ti_lit_tag, + tags((const char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) + { + evalStringCache[element] = TransferInstr(ti_lit, ((const char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) + { + if(element->properties == NULL) + { + evalStringCache[element] = TransferInstr(ti_b, " ", -1); + } + else + { + int pos = atoi((const char *) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, "", pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) + { + int pos = atoi((const char *) element->properties->children->content) - 1; + xmlNode *param = NULL; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + param = i; + break; + } + } + + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) + { + evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + } + + evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) + { + string value; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + 
value.append(evalString(i)); + } + } + return value; + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) + { + return processChunk(element); + } + else + { + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + + return evalString(element); +} + +void +Interchunk::processOut(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) + { + fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + } + } + } +} + +string +Interchunk::processChunk(xmlNode *localroot) +{ + string result; + result.append("^"); + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + result.append(evalString(i)); + } + } + + result.append("$"); + return result; +} + +void +Interchunk::processInstruction(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } +} + +void +Interchunk::processLet(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } 
+ + map::iterator it = evalStringCache.find(leftSide); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_var: + variables[ti.getContent()] = evalString(rightSide); + return; + + case ti_clip_tl: + word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); + return; + + default: + return; + } + } + if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = evalString(rightSide); + evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + } + + + word[pos]->setChunkPart(attr_items[(const char *) part], + evalString(rightSide)); + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, + (const char *) part, + pos, NULL); + } +} + +void +Interchunk::processAppend(xmlNode *localroot) +{ + string name; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "n")) + { + name = (char *) i->children->content; + break; + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + variables[name].append(evalString(i)); + } + } +} + +void +Interchunk::processModifyCase(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, 
(const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + } + + string const result = copycase(evalString(rightSide), + word[pos]->chunkPart(attr_items[(const char *) part])); + word[pos]->setChunkPart(attr_items[(const char *) part], result); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = copycase(evalString(rightSide), variables[val]); + } +} + +void +Interchunk::processCallMacro(xmlNode *localroot) +{ + const char *n = (const char *) localroot->properties->children->content; + int npar = 0; + + xmlNode *macro = macro_map[macros[n]]; + + for(xmlAttr *i = macro->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) + { + npar = atoi((const char *) i->children->content); + break; + } + } + + // ToDo: Is it at all valid if npar <= 0 ? 
+ + InterchunkWord **myword = NULL; + if(npar > 0) + { + myword = new InterchunkWord *[npar]; + } + string **myblank = NULL; + if(npar > 0) + { + myblank = new string *[npar]; + myblank[npar-1] = &emptyblank; + } + + int idx = 0; + int lastpos = 0; + for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + int pos = atoi((const char *) i->properties->children->content)-1; + myword[idx] = word[pos]; + if(idx-1 >= 0) + { + myblank[idx-1] = blank[lastpos]; + } + idx++; + lastpos = pos; + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + for(xmlNode *i = macro->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + delete[] myword; + delete[] myblank; +} + +void +Interchunk::processChoose(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "when")) + { + bool picked_option = false; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "test")) + { + if(!processTest(j)) + { + break; + } + else + { + picked_option = true; + } + } + else + { + processInstruction(j); + } + } + } + if(picked_option) + { + return; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + processInstruction(j); + } + } + } + } + } +} + +bool +Interchunk::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) 
"begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +Interchunk::processIn(xmlNode *localroot) +{ + xmlNode *value = NULL; + xmlChar *idlist = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(value == NULL) + { + value = i; + } + else + { + idlist = i->properties->children->content; + break; + } + } + } + + string sval = evalString(value); + + if(localroot->properties != NULL) + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + set &myset = listslow[(const char *) idlist]; + if(myset.find(tolower(sval)) != myset.end()) + { + return true; + } + else + { + return false; + } + } + } + + set &myset = lists[(const char *) idlist]; + if(myset.find(sval) != myset.end()) + { + return true; + } + else + { + return false; + } +} + +bool +Interchunk::processTest(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return processLogical(i); + } + } + return false; +} + +bool +Interchunk::processAnd(xmlNode *localroot) +{ + bool val = true; + for(xmlNode *i = 
localroot->children; val && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val && processLogical(i); + } + } + + return val; +} + +bool +Interchunk::processOr(xmlNode *localroot) +{ + bool val = false; + for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val || processLogical(i); + } + } + + return val; +} + +bool +Interchunk::processNot(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return !processLogical(i); + } + } + return false; +} + +bool +Interchunk::processEqual(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first) == evalString(second); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)) == tolower(evalString(second)); + } + else + { + return evalString(first) == evalString(second); + } + } +} + +bool +Interchunk::beginsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = 0; i != limit; i++) + { + if(s1[i] != s2[i]) + { + return false; + } + } + + return true; +} + +bool +Interchunk::endsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) + { + if(s1[j] != s2[i]) + { + return false; + } + } + + return true; +} + + +bool +Interchunk::processBeginsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = 
localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return beginsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return beginsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return beginsWith(evalString(first), evalString(second)); + } + } +} + +bool +Interchunk::processEndsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return endsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return endsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return endsWith(evalString(first), evalString(second)); + } + } +} + +bool +Interchunk::processBeginsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + 
for(; it != limit; it++) + { + if(beginsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Interchunk::processEndsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set: