Index: branches/apertium-tagger/apertium2/apertium/apertium_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_tagger.cc (revision 69632) @@ -0,0 +1,737 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+
+#include "apertium_tagger.h"
+
+#include "apertium_config.h"
+
+#include "align.h"
+#include "basic_exception_type.h"
+#include "basic_stream_tagger.h"
+#include "basic_stream_tagger_trainer.h"
+#include "basic_tagger.h"
+#include "err_exception.h"
+#include "exception.h"
+#include "file_tagger.h"
+#include "linebreak.h"
+#include "stream_5_3_1_tagger.h"
+#include "stream_5_3_1_tagger_trainer.h"
+#include "stream_5_3_2_tagger.h"
+#include "stream_5_3_2_tagger_trainer.h"
+#include "stream_5_3_3_tagger.h"
+#include "stream_5_3_3_tagger_trainer.h"
+#include
+#include
+#include
+
+#include
+
+#include "getopt_long.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef _MSC_VER
+#include
+#include
+#endif // _MSC_VER
+
+namespace Apertium {
+// Parses the command line and immediately runs the selected tagger action.
+//
+// Options are read with getopt_long(): the flag options (-d -f -m -p -z) are
+// recorded in TheFlags, -u / -w select the algorithm family, and -g / -r /
+// -s / -t select the action (the last three also take an ITERATIONS
+// argument). When neither -u nor -w is given the action runs on an HMM
+// tagger. With -h / --help, or when no action option was given, the usage
+// text is printed and the constructor returns.
+//
+// Throws err_Exception for an unrecognised option and for any
+// basic_ExceptionType raised while parsing or running (its message is first
+// written to std::cerr).
+apertium_tagger::apertium_tagger(int &argc, char **&argv)
+    : argc(argc), argv(argv), The_val(),
+
+      The_indexptr(), FunctionTypeTypeOption_indexptr(),
+      FunctionTypeOption_indexptr(),
+
+      TheFunctionTypeType(), TheUnigramType(), TheFunctionType(),
+      TheFunctionTypeOptionArgument(0), TheFlags() {
+  try {
+    while (true) {
+      // 'h' has to be listed here: getopt_long() reports option characters
+      // missing from the optstring as '?', so without it the short form -h
+      // fell through to the default case instead of reaching case 'h'.
+      The_val = getopt_long(argc, argv, "dfghmpr:s:t:u:wz", longopts, &The_indexptr);
+
+      if (The_val == -1)
+        break;
+
+      // Short options do not update The_indexptr; resynchronise it so that
+      // option_string() names the option that was actually seen.
+      set_indexptr();
+
+      switch (The_val) {
+      case 'd':
+        flagOptionCase(&basic_Tagger::Flags::getDebug,
+                       &basic_Tagger::Flags::setDebug);
+        break;
+      case 'f':
+        flagOptionCase(&basic_Tagger::Flags::getFirst,
+                       &basic_Tagger::Flags::setFirst);
+        break;
+      case 'm':
+        flagOptionCase(&basic_Tagger::Flags::getMark,
+                       &basic_Tagger::Flags::setMark);
+        break;
+      case 'p':
+        flagOptionCase(&basic_Tagger::Flags::getShowSuperficial,
+                       &basic_Tagger::Flags::setShowSuperficial);
+        break;
+      case 'z':
+        flagOptionCase(&basic_Tagger::Flags::getNullFlush,
+                       &basic_Tagger::Flags::setNullFlush);
+        break;
+      case 'u':
+        functionTypeTypeOptionCase(Unigram);
+
+        // Compare the whole argument. The previous one-character strncmp()
+        // accepted anything that merely started with a valid digit, so
+        // "-u 12" or "-u 3x" silently selected a model.
+        if (std::strcmp(optarg, "1") == 0) {
+          TheUnigramType = Stream_5_3_1;
+          break;
+        }
+
+        if (std::strcmp(optarg, "2") == 0) {
+          TheUnigramType = Stream_5_3_2;
+          break;
+        }
+
+        if (std::strcmp(optarg, "3") == 0) {
+          TheUnigramType = Stream_5_3_3;
+          break;
+        }
+
+        {
+          std::stringstream what_;
+          what_ << "invalid argument '" << optarg << "' for '--unigram'\n"
+                   "Valid arguments are:\n"
+                   " - '1'\n"
+                   " - '2'\n"
+                   " - '3'";
+          throw Exception::apertium_tagger::InvalidArgument(what_);
+        }
+        break;
+      case 'w':
+        functionTypeTypeOptionCase(SlidingWindow);
+        break;
+      case 'g':
+        functionTypeOptionCase(Tagger);
+        break;
+      case 'r':
+        functionTypeOptionCase(Retrain);
+        getIterationsArgument();
+        break;
+      case 's':
+        functionTypeOptionCase(Supervised);
+        getIterationsArgument();
+        break;
+      case 't':
+        functionTypeOptionCase(Train);
+        getIterationsArgument();
+        break;
+      case 'h':
+        help();
+        return;
+      default:
+        throw err_Exception();
+      }
+    }
+
+    // No -g / -r / -s / -t was given: nothing to run, show the usage text.
+    if (!TheFunctionType) {
+      help();
+      return;
+    }
+
+    switch (*TheFunctionType) {
+    case Tagger:
+      if (!TheFunctionTypeType) {
+        HMM HiddenMarkovModelTagger_;
+        g_FILE_Tagger(HiddenMarkovModelTagger_);
+        break;
+      }
+
+      switch (*TheFunctionTypeType) {
+      case Unigram: {
+        switch (*TheUnigramType) {
+        case Stream_5_3_1: {
+          Stream_5_3_1_Tagger Stream_5_3_1_Tagger_(TheFlags);
+          g_StreamTagger(Stream_5_3_1_Tagger_);
+        } break;
+        case Stream_5_3_2: {
+          Stream_5_3_2_Tagger Stream_5_3_2_Tagger_(TheFlags);
+          g_StreamTagger(Stream_5_3_2_Tagger_);
+        } break;
+        case Stream_5_3_3: {
+          Stream_5_3_3_Tagger Stream_5_3_3_Tagger_(TheFlags);
+          g_StreamTagger(Stream_5_3_3_Tagger_);
+        } break;
+        default:
+          std::abort();
+        }
+      } break;
+      case SlidingWindow: {
+        LSWPoST SlidingWindowTagger_;
+        g_FILE_Tagger(SlidingWindowTagger_);
+      } break;
+      default:
+        std::abort();
+      }
+
+      break;
+    case Retrain:
+      if (!TheFunctionTypeType) {
+        HMM HiddenMarkovModelTagger_;
+        r_FILE_Tagger(HiddenMarkovModelTagger_);
+        break;
+      }
+
+      switch (*TheFunctionTypeType) {
+      case Unigram: {
+        std::stringstream what_;
what_ << "invalid option -- 'u'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + r_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + case Supervised: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + s_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + switch (*TheUnigramType) { + case Stream_5_3_1: { + Stream_5_3_1_TaggerTrainer Stream_5_3_1_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_1_TaggerTrainer_); + } break; + case Stream_5_3_2: { + Stream_5_3_2_TaggerTrainer Stream_5_3_2_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_2_TaggerTrainer_); + } break; + case Stream_5_3_3: { + Stream_5_3_3_TaggerTrainer Stream_5_3_3_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_3_TaggerTrainer_); + } break; + default: + std::abort(); + } + } break; + case SlidingWindow: { + std::stringstream what_; + what_ << "invalid option -- 'w'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + default: + std::abort(); + } + + break; + case Train: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + t_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + std::stringstream what_; + what_ << "invalid option -- 'u'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + t_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + default: + std::abort(); + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::cerr << "apertium-tagger: " << basic_ExceptionType_.what() << '\n'; + throw err_Exception(); + } +} + +void apertium_tagger::help() { + + std::cerr << +"Usage: apertium-tagger [OPTION]... 
-g SERIALISED_TAGGER \\\n" +" [INPUT \\\n" +" [OUTPUT]]\n" +"\n" +" or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" +" CORPUS \\\n" +" SERIALISED_TAGGER\n" +"\n" +" or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" +" DICTIONARY \\\n" +" CORPUS \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS \\\n" +" UNTAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -s 0 \\\n" +" -u MODEL \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" +" DICTIONARY \\\n" +" CORPUS \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER\n" +"\n" +"\n" +"Mandatory arguments to long options are mandatory for short options too.\n" +"\n"; + + std::vector > options_description_; + options_description_.push_back(std::make_pair("-d, --debug", "with -g, print error messages about the input")); + options_description_.push_back(std::make_pair("-f, --first", "with -g, reorder each lexical unit's analyses so that the chosen one is first")); + options_description_.push_back(std::make_pair("-m, --mark", "with -g, mark disambiguated lexical units")); + options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); + options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from ")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); + 
align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); + options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); + options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); + align::align_(options_description_); + std::cerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); + align::align_(options_description_); +} + +std::string apertium_tagger::option_string(const int &indexptr_) { + return option_string(longopts[indexptr_]); +} + +std::string apertium_tagger::option_string(const struct option &option_) { + std::stringstream option_string_; + option_string_ << "--" << option_.name; + return option_string_.str(); +} + +void apertium_tagger::locale_global_() { + +#if defined __clang__ + + std::locale::global(std::locale("")); + +#else +#if defined __APPLE__ + + LtLocale::tryToSetLocale(); + +#else + + std::locale::global(std::locale("")); + +#endif // defined __APPLE__ +#endif // defined __clang__ +} + +const struct option apertium_tagger::longopts[] = { + {"help", no_argument, 0, 'h'}, + {"debug", no_argument, 0, 'd'}, + {"first", no_argument, 0, 'f'}, + {"mark", no_argument, 0, 'm'}, + {"show-superficial", no_argument, 0, 'p'}, + {"null-flush", no_argument, 0, 'z'}, + {"unigram", required_argument, 0, 'u'}, + {"sliding-window", no_argument, 0, 'w'}, + {"tagger", no_argument, 0, 'g'}, + {"retrain", required_argument, 0, 'r'}, + {"supervised", required_argument, 0, 's'}, + 
{"train", required_argument, 0, 't'}, + {0, 0, 0, 0}}; + +void apertium_tagger::set_indexptr() { + if (The_val == longopts[The_indexptr].val) + return; + + for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; + ++longopts_Index) { + if (The_val == longopts[longopts_Index].val) { + The_indexptr = longopts_Index; + return; + } + } +} + +void apertium_tagger::flagOptionCase( + bool (basic_Tagger::Flags::*GetFlag)() const, + void (basic_Tagger::Flags::*SetFlag)(const bool &)) { + if ((TheFlags.*GetFlag)()) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string() << '\''; + throw Exception::apertium_tagger::UnexpectedFlagOption(what_); + } + + (TheFlags.*SetFlag)(true); +} + +std::string apertium_tagger::option_string() { + return option_string(The_indexptr); +} + +void apertium_tagger::functionTypeTypeOptionCase( + const FunctionTypeType &FunctionTypeType_) { + if (FunctionTypeTypeOption_indexptr) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string(*FunctionTypeTypeOption_indexptr) + << '\''; + throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); + } + + TheFunctionTypeType = FunctionTypeType_; + FunctionTypeTypeOption_indexptr = The_indexptr; +} + +void apertium_tagger::functionTypeOptionCase( + const FunctionType &FunctionType_) { + if (FunctionTypeOption_indexptr) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string(*FunctionTypeOption_indexptr) + << '\''; + throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); + } + + TheFunctionType = FunctionType_; + FunctionTypeOption_indexptr = The_indexptr; +} + +void apertium_tagger::getIterationsArgument() { + try { + TheFunctionTypeOptionArgument = optarg_unsigned_long(); + } catch (const ExceptionType &ExceptionType_) { + std::stringstream what_; + what_ << "invalid argument '" << optarg << "' 
for '" << option_string()
+          << '\'';
+    throw Exception::apertium_tagger::InvalidArgument(what_);
+  }
+}
+
+// Converts the current option argument (the global optarg) to unsigned long.
+//
+// Throws str_end_not_eq_NULL when the text is not entirely a base-10 number,
+// optarg_eq_NULL when it is empty, and ERANGE_ when the value does not fit in
+// unsigned long or carries a minus sign.
+unsigned long apertium_tagger::optarg_unsigned_long() const {
+  char *str_end;
+  errno = 0;
+  unsigned long N_0 = std::strtoul(optarg, &str_end, 10);
+
+  if (*str_end != '\0') {
+    std::stringstream what_;
+    what_ << "can't convert char *optarg \"" << optarg << "\" to unsigned long";
+    throw Exception::apertium_tagger::str_end_not_eq_NULL(what_);
+  }
+
+  if (*optarg == '\0') {
+    std::stringstream what_;
+    what_ << "can't convert char *optarg of size 1 \"\" to unsigned long";
+    throw Exception::apertium_tagger::optarg_eq_NULL(what_);
+  }
+
+  // strtoul() accepts a leading minus sign and returns the negated value
+  // wrapped into unsigned range without setting errno, so "-1" used to become
+  // ULONG_MAX iterations. The whole string was consumed as one number by this
+  // point, so a '-' anywhere in it can only be that sign.
+  if (errno == ERANGE || std::strchr(optarg, '-') != NULL) {
+    std::stringstream what_;
+    what_ << "can't convert char *optarg \"" << optarg
+          << "\" to unsigned long, not in unsigned long range";
+    throw Exception::apertium_tagger::ERANGE_(what_);
+  }
+
+  return N_0;
+}
+
+// Opens `filename` on `stream` and throws open_stream_fail when that fails.
+// `metavar` is the usage-text name of the argument (e.g. "INPUT") and is only
+// used in the error message. T is any stream type offering
+// open(const char *) and fail(). The template parameter list had been
+// stripped from this text and is restored here.
+template <typename T>
+static void try_open_fstream(const char *metavar, const char *filename,
+                             T &stream) {
+  stream.open(filename);
+  if (stream.fail()) {
+    std::stringstream what_;
+    what_ << "can't open " << metavar << " file \"" << filename << "\"";
+    throw Exception::apertium_tagger::open_stream_fail(what_);
+  }
+}
+
+// fopen() wrapper: returns the opened FILE or throws
+// Exception::apertium_tagger::fopen naming `metavar` and `filename`.
+static FILE *try_open_file(const char *metavar, const char *filename,
+                           const char *flags) {
+  FILE *f = std::fopen(filename, flags);
+  if (f == NULL) {
+    std::stringstream what_;
+    what_ << "can't open " << metavar << " file \"" << filename << "\"";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+  return f;
+}
+
+// Same as try_open_file(); under MSVC the file is additionally switched to
+// _O_U8TEXT mode.
+static inline FILE *try_open_file_utf8(const char *metavar, const char *filename,
+                                       const char *flags) {
+  FILE *f = try_open_file(metavar, filename, flags);
+#ifdef _MSC_VER
+  _setmode(_fileno(f), _O_U8TEXT);
+#endif // _MSC_VER
+  return f;
+}
+
+// fclose() wrapper: throws Exception::apertium_tagger::fclose when closing
+// `file` fails.
+static void try_close_file(const char *metavar, const char *filename, FILE *file) {
+  if (std::fclose(file) != 0) {
+    std::stringstream what_;
+    what_ << "can't close " << metavar << " file \"" << filename << "\"";
+    throw
Exception::apertium_tagger::fclose(what_);
+  }
+}
+
+// Runs a unigram stream tagger (-g together with -u).
+//
+// File arguments after the options: SERIALISED_TAGGER [INPUT [OUTPUT]].
+// The tagger is deserialised from SERIALISED_TAGGER, then tags INPUT into
+// OUTPUT. Without OUTPUT the result goes to std::wcout; without INPUT the
+// Stream is built from TheFlags alone (presumably reading standard input --
+// confirm against the Stream class).
+//
+// Throws UnexpectedFileArgumentCount, open_stream_fail or deserialise.
+void apertium_tagger::g_StreamTagger(basic_StreamTagger &StreamTagger_) {
+  locale_global_();
+
+  if (argc - optind < 1 || !(argc - optind < 4)) {
+    std::stringstream what_;
+    what_ << "expected 1, 2, or 3 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  std::ifstream SerialisedAnalysisFrequencies;
+  try_open_fstream("SERIALISED_TAGGER", argv[optind],
+                   SerialisedAnalysisFrequencies);
+
+  try {
+    StreamTagger_.deserialise(SerialisedAnalysisFrequencies);
+  } catch (const basic_ExceptionType &basic_ExceptionType_) {
+    std::stringstream what_;
+    what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind]
+          << "\" Reason: " << basic_ExceptionType_.what();
+    throw Exception::apertium_tagger::deserialise(what_);
+  }
+
+  if (argc - optind < 2) {
+    Stream Input(TheFlags);
+    StreamTagger_.tag(Input, std::wcout);
+    return;
+  }
+
+  std::wifstream Input_stream;
+  try_open_fstream("INPUT", argv[optind + 1], Input_stream);
+
+  if (argc - optind < 3) {
+    Stream Input(TheFlags, Input_stream, argv[optind + 1]);
+    StreamTagger_.tag(Input, std::wcout);
+    return;
+  }
+
+  // Open the OUTPUT file on Output_stream. This used to pass Input_stream,
+  // which is already open at this point: open() on an open file stream fails,
+  // so the three-argument form always ended in "can't open OUTPUT file" and
+  // Output_stream was never opened.
+  std::wofstream Output_stream;
+  try_open_fstream("OUTPUT", argv[optind + 2], Output_stream);
+
+  Stream Input(TheFlags, Input_stream, argv[optind + 1]);
+  StreamTagger_.tag(Input, Output_stream);
+}
+
+// Trains a unigram stream tagger from a hand-tagged corpus (-s 0 together
+// with -u). File arguments: SERIALISED_TAGGER TAGGED_CORPUS. Any ITERATIONS
+// value other than 0 is rejected with InvalidArgument.
+void apertium_tagger::s_StreamTaggerTrainer(
+    basic_StreamTaggerTrainer &StreamTaggerTrainer_) {
+  locale_global_();
+
+  if (TheFunctionTypeOptionArgument != 0) {
+    std::stringstream what_;
+    what_ << "invalid argument '" << TheFunctionTypeOptionArgument
+          << "' for '--supervised'";
+    throw Exception::apertium_tagger::InvalidArgument(what_);
+  }
+
+  if (argc - optind < 2 || !(argc - optind < 3)) {
+    std::stringstream what_;
+    what_ << "expected 2 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  std::wifstream
TaggedCorpus_stream;
+  try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream);
+
+  // Label the Stream with the file it actually reads. This used to pass
+  // argv[optind] (the SERIALISED_TAGGER path), unlike g_StreamTagger, which
+  // passes the same argv entry it opened. NOTE(review): the third argument is
+  // assumed to be the file name used in diagnostics -- confirm in Stream.
+  Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]);
+  StreamTaggerTrainer_.train(TaggedCorpus);
+
+  std::ofstream Serialised_basic_Tagger;
+  try_open_fstream("SERIALISED_TAGGER", argv[optind],
+                   Serialised_basic_Tagger);
+
+  StreamTaggerTrainer_.serialise(Serialised_basic_Tagger);
+}
+
+// Runs a FILE-based tagger (-g without -u: HMM, or LSWPoST with -w).
+//
+// File arguments: SERIALISED_TAGGER [INPUT [OUTPUT]]. The tagger is read from
+// SERIALISED_TAGGER, configured from TheFlags, and then tags INPUT (default
+// stdin) into OUTPUT (default stdout).
+//
+// Throws UnexpectedFileArgumentCount, or the fopen / fclose exceptions of the
+// try_*_file helpers.
+void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 1 || !(argc - optind < 4)) {
+    std::stringstream what_;
+    what_ << "expected 1, 2, or 3 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  FILE *Serialised_FILE_Tagger =
+      try_open_file("SERIALISED_TAGGER", argv[optind], "rb");
+  FILE_Tagger_.deserialise(Serialised_FILE_Tagger);
+  try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger);
+
+  // Forward the command-line flags to the tagger and to TaggerWord.
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+  TaggerWord::generate_marks = TheFlags.getMark();
+  FILE_Tagger_.set_show_sf(TheFlags.getShowSuperficial());
+  FILE_Tagger_.setNullFlush(TheFlags.getNullFlush());
+
+  if (argc - optind < 2)
+    FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst());
+  else {
+    FILE *Input = try_open_file("INPUT", argv[optind + 1], "r");
+
+    if (argc - optind < 3)
+      FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst());
+    else {
+      FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w");
+      FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst());
+      try_close_file("OUTPUT", argv[optind + 2], Output);
+    }
+
+    try_close_file("INPUT", argv[optind + 1], Input);
+  }
+}
+
+// Retrains an existing FILE-based tagger (-r). File arguments:
+// CORPUS SERIALISED_TAGGER.
+void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 2 || !(argc - optind < 3)) {
+    std::stringstream what_;
+    what_ << "expected 2 file arguments, got " << argc - optind;
+    throw
Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  // Load the tagger that is going to be retrained (second file argument).
+  FILE *Serialised_FILE_Tagger =
+      try_open_file("SERIALISED_TAGGER", argv[optind + 1], "rb");
+  FILE_Tagger_.deserialise(Serialised_FILE_Tagger);
+  try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger);
+
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+
+  // Train on CORPUS for the ITERATIONS count given with -r.
+  FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind], "r");
+  FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument);
+  try_close_file("CORPUS", argv[optind], Corpus);
+
+  // Write the retrained tagger back over the same SERIALISED_TAGGER file.
+  Serialised_FILE_Tagger =
+      try_open_file("SERIALISED_TAGGER", argv[optind + 1], "wb");
+  FILE_Tagger_.serialise(Serialised_FILE_Tagger);
+  try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger);
+}
+
+// Supervised training of a FILE-based tagger (-s without -u).
+//
+// File arguments, in order: DICTIONARY CORPUS TAGGER_SPECIFICATION
+// SERIALISED_TAGGER TAGGED_CORPUS UNTAGGED_CORPUS. Probabilities are
+// initialised from the tagged/untagged corpus pair, then the tagger is
+// trained on CORPUS and written to SERIALISED_TAGGER.
+void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 6 || !(argc - optind < 7)) {
+    std::stringstream what_;
+    what_ << "expected 6 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  // argv[optind + 2] is the TAGGER_SPECIFICATION argument.
+  FILE_Tagger_.deserialise(argv[optind + 2]);
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+
+  FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r");
+  FILE_Tagger_.read_dictionary(Dictionary);
+  try_close_file("DICTIONARY", argv[optind], Dictionary);
+
+  FILE *TaggedCorpus = try_open_file_utf8("TAGGED_CORPUS", argv[optind + 4], "r");
+  FILE *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", argv[optind + 5], "r");
+  FILE_Tagger_.init_probabilities_from_tagged_text_(TaggedCorpus,
+                                                    UntaggedCorpus);
+  try_close_file("TAGGED_CORPUS", argv[optind + 4], TaggedCorpus);
+  try_close_file("UNTAGGED_CORPUS", argv[optind + 5], UntaggedCorpus);
+
+  FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r");
+  FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument);
+
try_close_file("CORPUS", argv[optind + 1], Corpus);
+  // ^ Closes the CORPUS handle itself. This call and the one at the end of
+  // the function both used to pass UntaggedCorpus, which was already closed
+  // above: a repeated fclose() on the same FILE (undefined behaviour), while
+  // Corpus and the freshly written tagger file were never closed.
+
+  FILE *Serialised_FILE_Tagger =
+      try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb");
+  FILE_Tagger_.serialise(Serialised_FILE_Tagger);
+  try_close_file("SERIALISED_TAGGER", argv[optind + 3], Serialised_FILE_Tagger);
+}
+
+// Unsupervised training of a FILE-based tagger (-t).
+//
+// File arguments, in order: DICTIONARY CORPUS TAGGER_SPECIFICATION
+// SERIALISED_TAGGER. The probabilities are initialised from CORPUS, the
+// tagger is then trained on the same CORPUS for the ITERATIONS count given
+// with -t, and the result is written to SERIALISED_TAGGER.
+//
+// Throws UnexpectedFileArgumentCount, or the fopen / fclose exceptions of the
+// try_*_file helpers.
+void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 4 || !(argc - optind < 5)) {
+    std::stringstream what_;
+    what_ << "expected 4 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  // argv[optind + 2] is the TAGGER_SPECIFICATION argument.
+  FILE_Tagger_.deserialise(argv[optind + 2]);
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+
+  FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r");
+  FILE_Tagger_.read_dictionary(Dictionary);
+  try_close_file("DICTIONARY", argv[optind], Dictionary);
+
+  FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r");
+  FILE_Tagger_.init_probabilities_kupiec_(Corpus);
+  FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument);
+  try_close_file("CORPUS", argv[optind + 1], Corpus);
+
+  FILE *Serialised_FILE_Tagger =
+      try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb");
+  FILE_Tagger_.serialise(Serialised_FILE_Tagger);
+  try_close_file("SERIALISED_TAGGER", argv[optind + 3], Serialised_FILE_Tagger);
+}
+}
+
+// Program entry point: all work happens in the apertium_tagger constructor.
+// An err_Exception (already reported by the constructor) yields a hint to
+// --help and exit status 1.
+int main(int argc, char **argv) {
+  try {
+    apertium_tagger(argc, argv);
+  } catch (const err_Exception &err_Exception_) {
+    std::cerr << "Try 'apertium-tagger --help' for more information.\n";
+    return 1;
+  } catch (...)
{ + throw; + } +} Index: branches/apertium-tagger/apertium2/apertium/exception.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/exception.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/exception.h (revision 69632) @@ -0,0 +1,92 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef EXCEPTION_APERTIUM_TAGGER_H +#define EXCEPTION_APERTIUM_TAGGER_H + +#include "exception_type.h" + +#include + +namespace Apertium { +namespace Exception { + +#define EXCEPTION(EXCEPTION_TYPE) \ + class EXCEPTION_TYPE : public ::Apertium::ExceptionType { \ + public: \ + EXCEPTION_TYPE(const char *const what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const std::string &what_) : ExceptionType(what_) {} \ + EXCEPTION_TYPE(const std::stringstream &what_) : ExceptionType(what_) {} \ + ~EXCEPTION_TYPE() throw() {} \ + }; + +namespace Analysis { +EXCEPTION(TheMorphemes_empty) +} + +namespace apertium_tagger { +EXCEPTION(deserialise) +EXCEPTION(fclose) +EXCEPTION(fopen) +EXCEPTION(open_stream_fail) +EXCEPTION(optarg_eq_NULL) +EXCEPTION(str_end_not_eq_NULL) +EXCEPTION(ERANGE_) +EXCEPTION(InvalidArgument) +EXCEPTION(InvalidOption) +EXCEPTION(UnexpectedFileArgumentCount) +EXCEPTION(UnexpectedFlagOption) +EXCEPTION(UnexpectedFunctionTypeOption) +EXCEPTION(UnexpectedFunctionTypeTypeOption) +} + +namespace Deserialiser { +EXCEPTION(size_t_) +EXCEPTION(not_Stream_good) +EXCEPTION(wchar_t_) +} + +namespace LexicalUnit { +EXCEPTION(TheAnalyses_empty) +} + +namespace Morpheme { +EXCEPTION(TheLemma_empty) +EXCEPTION(TheTags_empty) +} + +namespace Optional { +EXCEPTION(TheOptionalTypePointer_null) +} + +namespace Serialiser { +EXCEPTION(not_Stream_good) +EXCEPTION(size_t_) +EXCEPTION(wchar_t_) +} + +namespace Tag { +EXCEPTION(TheTags_empty) +} + +namespace wchar_t_ExceptionType { +EXCEPTION(EILSEQ_) +} + +#undef EXCEPTION +} +} + +#endif // EXCEPTION_APERTIUM_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/lswpost.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lswpost.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lswpost.cc (revision 69632) @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; 
you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +/** + * Light Sliding-Window Part of Speech Tagger (LSWPoST) implementation (source) + * + * @author Gang Chen - pkuchengang@gmail.com + */ + + +#include +#include +#include "apertium_config.h" +#include +#include + +#ifdef WIN32 +#define isnan(n) _isnan(n) +#define isinf(n) (!_finite(n)) +#endif + +#ifdef __clang__ +#undef __GNUC__ +#endif + +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace Apertium; +using namespace tagger_utils; + +void LSWPoST::deserialise(FILE *Serialised_FILE_Tagger) { + tdlsw.read(Serialised_FILE_Tagger); + eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; +} + +std::vector &LSWPoST::getArrayTags() { + return tdlsw.getArrayTags(); +} + +void LSWPoST::serialise(FILE *Stream_) { tdlsw.write(Stream_); } + +void LSWPoST::deserialise(const TaggerData &Deserialised_FILE_Tagger) { + tdlsw = TaggerDataLSW(Deserialised_FILE_Tagger); + eos = (tdlsw.getTagIndex())[L"TAG_SENT"]; +} + +void LSWPoST::init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus) { + std::abort(); +} + +void LSWPoST::init_probabilities_kupiec_(FILE *Corpus) { + init_probabilities(Corpus); +} + +void LSWPoST::train(FILE *Corpus, unsigned long Count) { + for (; Count > 0; --Count) { + std::fseek(Corpus, 0, SEEK_SET); + train(Corpus); + } +} + +LSWPoST::LSWPoST() {} + +LSWPoST::LSWPoST(TaggerDataLSW t) { + tdlsw = t; + eos = 
(tdlsw.getTagIndex())[L"TAG_SENT"]; +} + +LSWPoST::~LSWPoST() {} + +LSWPoST::LSWPoST(TaggerDataLSW *tdlsw) : tdlsw(*tdlsw) {} + +void +LSWPoST::set_eos(TTag t) { + eos = t; +} + +void +LSWPoST::init_probabilities(FILE *ftxt) { + + int N = tdlsw.getN(); + int nw = 0; + TaggerWord *word = NULL; + set tags_left, tags_mid, tags_right; + set::iterator iter_left, iter_mid, iter_right; + vector > > para_matrix(N, vector >(N, vector(N, 0))); + MorphoStream morpho_stream(ftxt, true, &tdlsw); + int num_valid_seq = 0; + + word = new TaggerWord(); // word for tags left + word->add_tag(eos, L"sent", tdlsw.getPreferRules()); + tags_left = word->get_tags(); // tags left + if (tags_left.size()==0) { //This is an unknown word + tags_left = tdlsw.getOpenClass(); + } + + require_ambiguity_class(tdlsw, tags_left, *word, nw); + ++nw; + delete word; + word = morpho_stream.get_next_word(); // word for tags mid + tags_mid = word->get_tags(); // tags mid + if (tags_mid.size()==0) { //This is an unknown word + tags_mid = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_mid, *word, nw); + ++nw; + delete word; + if (morpho_stream.getEndOfFile()) { + return; + } + + word = morpho_stream.get_next_word(); // word for tags right + + // count each element of the para matrix + while (word != NULL) { + if (++nw % 10000 == 0) { + wcerr << L'.' 
<< flush; + } + + tags_right = word->get_tags(); // tags right + if (tags_right.size()==0) { //This is an unknown word + tags_right = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_right, *word, nw); + + num_valid_seq = tags_left.size() * tags_mid.size() * tags_right.size(); + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + if (!is_valid_seq(*iter_left, *iter_mid, *iter_right)) { + --num_valid_seq; + } + } // for iter_right + } // for iter_mid + } // for iter_left + + if (num_valid_seq != 0) { + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + if (is_valid_seq(*iter_left, *iter_mid, *iter_right)) { + para_matrix[*iter_left][*iter_mid][*iter_right] += 1.0 / num_valid_seq; + } + } // for iter_right + } // for iter_mid + } // for iter_left + } + + tags_left = tags_mid; + tags_mid = tags_right; + delete word; + word = morpho_stream.get_next_word(); + } // while word != NULL + + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + tdlsw.getD()[i][j][k] = para_matrix[i][j][k]; + } + } + } + + wcerr << L"\n"; +} + +bool LSWPoST::is_valid_seq(TTag left, TTag mid, TTag right) { + + vector &forbid_rules = tdlsw.getForbidRules(); + vector &enforce_rules = tdlsw.getEnforceRules(); + + for (size_t r = 0; r < forbid_rules.size(); ++r) { + if ((left == forbid_rules[r].tagi && mid == forbid_rules[r].tagj) + || (mid == forbid_rules[r].tagi && right == forbid_rules[r].tagj)) { + return false; + } + }// for r in forbid rules + + for (size_t r = 0; r < enforce_rules.size(); ++r) { + if (left == enforce_rules[r].tagi) { + bool found 
= false;
+      for (size_t j = 0; j < enforce_rules[r].tagsj.size(); ++j) {
+        if (enforce_rules[r].tagsj[j] == mid) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        return false;
+      }
+    } else if (mid == enforce_rules[r].tagi) {
+      bool found = false;
+      for (size_t j = 0; j < enforce_rules[r].tagsj.size(); ++j) {
+        if (enforce_rules[r].tagsj[j] == right) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        return false;
+      }
+    }
+  } // for r in enforce rules
+
+  return true;
+}
+
+// Reads the dictionary from fdic into tdlsw and reports the number of states
+// (tags) and ambiguity classes on wcerr.
+//
+// NOTE(review): everything from the second "<<" after M up to the "set<TTag>"
+// declaration in train() was lost when angle-bracketed text was stripped from
+// this file (the text read `M < tags_left, ...`). The message tail and the
+// setProbabilities(N) call below are reconstructed from memory of the
+// upstream file -- verify against the repository revision.
+void
+LSWPoST::read_dictionary(FILE *fdic) {
+  tagger_utils::read_dictionary(fdic, tdlsw);
+  int N = (tdlsw.getTagIndex()).size();
+  int M = (tdlsw.getOutput()).size();
+  wcerr << N << L" states and " << M << L" ambiguity classes\n";
+  tdlsw.setProbabilities(N);
+}
+
+// One training pass over the corpus ftxt: slides a three-word window
+// (left, mid, right) over the text, accumulates the normalised current
+// D[left][mid][right] values into para_matrix_new, and finally copies that
+// matrix back into tdlsw.getD().
+//
+// NOTE(review): the signature and the first three declarations are
+// reconstructed to mirror init_probabilities(); the call `train(Corpus)` from
+// train(FILE *, unsigned long) and the uses of ftxt, N, nw and word below
+// require them. The template arguments (set<TTag>, vector<...<double> >)
+// were stripped as well and are restored from how the values are used.
+void
+LSWPoST::train(FILE *ftxt) {
+  int N = tdlsw.getN();
+  int nw = 0;
+  TaggerWord *word = NULL;
+  set<TTag> tags_left, tags_mid, tags_right;
+  set<TTag>::iterator iter_left, iter_mid, iter_right;
+  vector<vector<vector<double> > > para_matrix_new(N, vector<vector<double> >(N, vector<double>(N, 0)));
+  MorphoStream morpho_stream(ftxt, true, &tdlsw);
+
+  word = new TaggerWord(); // word for tags left
+  word->add_tag(eos, L"sent", tdlsw.getPreferRules());
+  tags_left = word->get_tags(); // tags left
+  if (tags_left.size()==0) { //This is an unknown word
+    tags_left = tdlsw.getOpenClass();
+  }
+  require_ambiguity_class(tdlsw, tags_left, *word, nw);
+  ++nw;
+  delete word;
+  word = morpho_stream.get_next_word(); // word for tags mid
+  tags_mid = word->get_tags(); // tags mid
+  if (tags_mid.size()==0) { //This is an unknown word
+    tags_mid = tdlsw.getOpenClass();
+  }
+  require_ambiguity_class(tdlsw, tags_mid, *word, nw);
+  ++nw;
+  delete word;
+  if (morpho_stream.getEndOfFile()) {
+    return;
+  }
+
+  word = morpho_stream.get_next_word(); // word for tags right
+
+  while (word) {
+    // Progress dot every 10000 words.
+    if (++nw % 10000 == 0) {
+      wcerr << L'.'
<< flush; + } + + tags_right = word->get_tags(); // tags right + if (tags_right.size()==0) { //This is an unknown word + tags_right = tdlsw.getOpenClass(); + } + require_ambiguity_class(tdlsw, tags_right, *word, nw); + + double normalization = 0; + + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + normalization += tdlsw.getD()[*iter_left][*iter_mid][*iter_right]; + } + } + } + + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + if (normalization > ZERO) { + para_matrix_new[*iter_left][*iter_mid][*iter_right] += + tdlsw.getD()[*iter_left][*iter_mid][*iter_right] / normalization; + } + } + } + } + + tags_left = tags_mid; + tags_mid = tags_right; + delete word; + word = morpho_stream.get_next_word(); + } + + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + tdlsw.getD()[i][j][k] = para_matrix_new[i][j][k]; + } + } + } +} + +void +LSWPoST::print_para_matrix() { + wcout << L"para matrix D\n----------------------------\n"; + for (int i = 0; i < tdlsw.getN(); ++i) { + for (int j = 0; j < tdlsw.getN(); ++j) { + for (int k = 0; k < tdlsw.getN(); ++k) { + wcout << L"D[" << i << L"][" << j << L"][" << k << L"] = " + << tdlsw.getD()[i][j][k] << "\n"; + } + } + } +} + +void +LSWPoST::tagger(FILE *Input, FILE *Output, const bool &First) { + TaggerWord *word_left = NULL, *word_mid = NULL, *word_right = NULL; + set tags_left, tags_mid, tags_right; + set::iterator iter_left, iter_mid, iter_right; + MorphoStream morpho_stream(Input, debug, &tdlsw); + morpho_stream.setNullFlush(null_flush); + + word_left = new TaggerWord(); // word left + 
word_left->add_tag(eos, L"sent", tdlsw.getPreferRules()); + word_left->set_show_sf(show_sf); + tags_left = word_left->get_tags(); // tags left + + warn_absent_ambiguity_class(tdlsw, tags_left, *word_left, debug); + word_mid = morpho_stream.get_next_word(); // word mid + word_mid->set_show_sf(show_sf); + tags_mid = word_mid->get_tags(); // tags mid + + warn_absent_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); + if (morpho_stream.getEndOfFile()) { + delete word_left; + delete word_mid; + return; + } + word_right = morpho_stream.get_next_word(); // word_right + word_right->set_show_sf(show_sf); + + wstring micad; + + while (word_right) { + tags_right = word_right->get_tags(); + warn_absent_ambiguity_class(tdlsw, tags_right, *word_right, debug); + + double max = -1; + TTag tag_max = *tags_mid.begin(); + for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) { + double n = 0; + for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) { + for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) { + n += tdlsw.getD()[*iter_left][*iter_mid][*iter_right]; + } + } + if (n > max) { + max = n; + tag_max = *iter_mid; + } + } + + micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())[L"TAG_kEOF"]); + fputws_unlocked(micad.c_str(), Output); + if (morpho_stream.getEndOfFile()) { + if (null_flush) { + fputwc_unlocked(L'\0', Output); + } + fflush(Output); + morpho_stream.setEndOfFile(false); + } + + delete word_left; + word_left = word_mid; + tags_left = tags_mid; + word_mid = word_right; + tags_mid = tags_right; + word_right = morpho_stream.get_next_word(); + if (word_right != NULL) { + word_right->set_show_sf(show_sf); + } + } + delete word_left; + delete word_mid; +} Index: branches/apertium-tagger/apertium2/apertium/tagger_utils.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_utils.cc (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tagger_utils.cc (revision 69632) @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include + +#include +#include +#include +#include +#include +#ifdef _MSC_VER +#define wcstok wcstok_s +#endif +#ifdef __MINGW32__ + +wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) { + (void)ptr; + return wcstok(wcs, delim); +} + +#define wcstok _wcstok +#endif + +using namespace Apertium; + + +void tagger_utils::fatal_error (wstring const &s) { + wcerr< v[], int l) { + for(int i=0; i0)&&(s.at(s.length()-1)==L' ')) + s.erase(s.length()-1,1); + if ((s.length()>0)&&(s.at(0)==L' ')) + s.erase(0,1); + + return s; +} + +void +tagger_utils::read_dictionary(FILE *fdic, TaggerData &td) { + int i, k, nw = 0; + TaggerWord *word = NULL; + set tags; + Collection &output = td.getOutput(); + + MorphoStream morpho_stream(fdic, true, &td); + + // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark + + word = morpho_stream.get_next_word(); + + while (word) { + if (++nw % 10000 == 0) + wcerr << L'.' 
<< flush; + + tags = word->get_tags(); + + if (tags.size() > 0) + k = output[tags]; + + delete word; + word = morpho_stream.get_next_word(); + } + wcerr << L"\n"; + + // OPEN AMBIGUITY CLASS + // It contains all tags that are not closed. + // Unknown words are assigned the open ambiguity class + k = output[td.getOpenClass()]; + + // Create ambiguity class holding one single tag for each tag. + // If not created yet + int N = (td.getTagIndex()).size(); + for(i = 0; i != N; i++) { + set amb_class; + amb_class.insert(i); + k = output[amb_class]; + } +} + +set +tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { + set &ret = td.getOpenClass(); + Collection &output = td.getOutput(); + + for (int k=0; k &ambg_class = output[k]; + if (ambg_class.size() >= ret.size()) { + continue; + } + if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { + ret = ambg_class; + } + } + return ret; +} + +void +tagger_utils::require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw) { + if (td.getOutput().has_not(tags)) { + wstring errors; + errors = L"A new ambiguity class was found. I cannot continue.\n"; + errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n"; + errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n"; + if (nw >= 0) { + std::wostringstream ws; + ws << (nw + 1); + errors+= L"Line number: " + ws.str() + L"\n"; + } + errors+= L"Take a look at the dictionary, then retrain."; + fatal_error(errors); + } +} + +static void _warn_absent_ambiguity_class(TaggerWord &word) { + wstring errors; + errors = L"A new ambiguity class was found. 
\n"; + errors += L"Retraining the tagger is necessary so as to take it into account.\n"; + errors += L"Word '" + word.get_superficial_form() + L"'.\n"; + errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; + wcerr << L"Error: " << errors; +} + +set +tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags)) { + if (debug) { + _warn_absent_ambiguity_class(word); + } + return find_similar_ambiguity_class(td, tags); + } + return tags; +} + +void +tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags) && debug) { + _warn_absent_ambiguity_class(word); + } +} + +template +ostream& operator<< (ostream& os, const map & f){ + typename map ::const_iterator it; + os<first<<' '<second; + return os; +} + +template +istream& operator>> (istream& is, map & f) { + int n, i, k; + f.clear(); + is>>n; + for (k=0; k>i; // warning: does not work if both + is>>f[i]; // lines merged in a single one + } + if (is.bad()) tagger_utils::fatal_error(L"reading map"); + return is; +} + +template +ostream& operator<< (ostream& os, const set& s) { + typename set::iterator it = s.begin(); + os<<'{'; + if (it!=s.end()) { + os<<*it; + while (++it!=s.end()) os<<','<<*it; + } + os<<'}'; + return os; +} + Index: branches/apertium-tagger/apertium2/apertium/tagger_utils.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_utils.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_utils.h (revision 69632) @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef __TAGGERUTILS_H +#define __TAGGERUTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace tagger_utils +{ +/** Print a fatal error message + * @param s the error message to print + */ +void fatal_error (wstring const &s); + +/** Print a fatal error message related to a file + * @param s the file name to be printted in the error message + */ +void file_name_error (string const &s); + +/** Convert from int to string + * @param i the int value to convert + * @return an string representing the number recived as input + */ +char *itoa(int i); + +/** Make all array positions equal to zero + * @param a the array + * @param l length of the array a + */ +void clear_array_double(double a[], int l); + +/** Clear all vectors stored in array v + * @param v array of vectors + * @param l length of the array v + */ +void clear_array_vector(vector v[], int l); + +/** Return the number of tokens in the multiword unit + */ + int ntokens_multiword(wstring const &s); + +/** Devuelve el nș de guiones que contiene la cadena pasada como argumento + */ +int nguiones_fs(wstring const &cadena); + +/** Reads the expanded dictionary received as a parameter puts the resulting + * ambiguity classes that the tagger will manage. + * @param fdic the input stream with the expanded dictionary to read + * @param td the tagger data instance to mutate + */ +void read_dictionary(FILE *fdic, TaggerData &td); + +/** This method returns a known ambiguity class that is a subset of +* the one received as a parameter. 
This is useful when a new +* ambiguity class is found because of changes in the morphological +* dictionary used by the MT system. +* @param c set of tags (ambiguity class) +* @return a known ambiguity class +*/ +set find_similar_ambiguity_class(TaggerData &td, set &c); + +/** Dies with an error message if the tags aren't in the tagger data */ +void require_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, int nw); + +/** As with find_similar_ambiguity_class, but returns tags if it's already fine + * & prints a warning if debug */ +set require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); + +/** Just prints a warning if debug */ +void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); + +wstring trim(wstring s); + +}; + +template +ostream& operator<< (ostream& os, const map & f); +template +istream& operator>> (istream& is, map & f); +template +ostream& operator<< (ostream& os, const set& s); + +#endif Index: branches/apertium-tagger/apertium2/apertium/hmm.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/hmm.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/hmm.cc (revision 69632) @@ -0,0 +1,872 @@ + +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +/* + * First order hidden Markov model (HMM) implementation (source) + * + * @author Felipe SĂĄnchez-MartĂ­nez - fsanchez@dlsi.ua.es + */ + +#include +#include +#include "apertium_config.h" +#include +#include + +#ifdef WIN32 +#define isnan(n) _isnan(n) +#define isinf(n) (!_finite(n)) +#endif + +#ifdef __clang__ +#undef __GNUC__ +#endif + +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace tagger_utils; + +void HMM::deserialise(FILE *Serialised_FILE_Tagger) { + tdhmm.read(Serialised_FILE_Tagger); + eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; +} + +std::vector &HMM::getArrayTags() { + return tdhmm.getArrayTags(); +} + +void HMM::serialise(FILE *Stream_) { tdhmm.write(Stream_); } + +void HMM::deserialise(const TaggerData &Deserialised_FILE_Tagger) { + tdhmm = TaggerDataHMM(Deserialised_FILE_Tagger); + eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; +} + +void HMM::init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus) { + init_probabilities_from_tagged_text(TaggedCorpus, UntaggedCorpus); + apply_rules(); +} + +void HMM::init_probabilities_kupiec_(FILE *Corpus) { + init_probabilities_kupiec(Corpus); + apply_rules(); +} + +void HMM::train(FILE *Corpus, unsigned long Count) { + for (; Count > 0; --Count) { + std::fseek(Corpus, 0, SEEK_SET); + train(Corpus); + } + + apply_rules(); +} + +HMM::HMM() {} + +HMM::HMM(TaggerDataHMM tdhmm) +{ + tdhmm = tdhmm; + eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; +} + +HMM::HMM(TaggerDataHMM *tdhmm) : tdhmm(*tdhmm) {} + +HMM::~HMM() {} + +void +HMM::init() +{ +} + +void +HMM::set_eos(TTag t) +{ + eos = t; +} + +void +HMM::read_ambiguity_classes(FILE *in) +{ + while(in) + { + int ntags = Compression::multibyte_read(in); + + if(feof(in)) + { + break; + } + set ambiguity_class; + + for(; ntags != 0; ntags--) + { + ambiguity_class.insert(Compression::multibyte_read(in)); + } + + if(ambiguity_class.size() != 0) + { + tdhmm.getOutput().add(ambiguity_class); + } + } + + 
tdhmm.setProbabilities(tdhmm.getTagIndex().size(), tdhmm.getOutput().size()); +} + +void +HMM::write_ambiguity_classes(FILE *out) +{ + for(int i=0, limit = tdhmm.getOutput().size(); i != limit; i++) + { + set const &ac = (tdhmm.getOutput())[i]; + Compression::multibyte_write(ac.size(), out); + for(set::const_iterator it = ac.begin(), limit2 = ac.end(); + it != limit2; it++) + { + Compression::multibyte_write(*it, out); + } + } +} + +void +HMM::read_probabilities(FILE *in) +{ + tdhmm.read(in); +} + +void +HMM::write_probabilities(FILE *out) +{ + tdhmm.write(out); +} + +void +HMM::init_probabilities_kupiec (FILE *is) +{ + int N = tdhmm.getN(); + int M = tdhmm.getM(); + int i=0, j=0, k=0, k1=0, k2=0, nw=0; +#ifdef __GNUC__ + double classes_ocurrences[M]; //M = Number of ambiguity classes + double classes_pair_ocurrences[M][M]; + double tags_estimate[N]; //N = Number of tags (states) + double tags_pair_estimate[N][N]; +#else + vector classes_ocurrences (M, 1); + vector > classes_pair_ocurrences(M, vector(M, 1)); + vector tags_estimate(N, 0); + vector > tags_pair_estimate(N, vector(N, 0)); +#endif + + Collection &output = tdhmm.getOutput(); + + MorphoStream lexmorfo(is, true, &tdhmm); + + TaggerWord *word=NULL; + +#ifdef __GNUC__ + for(k=0; k tags; + tags.insert(eos); + k1=output[tags]; //The first tag (ambiguity class) seen is the end-of-sentence + + //We count for each ambiguity class the number of ocurrences + word = lexmorfo.get_next_word(); + while((word)) { + if (++nw%10000==0) wcerr<get_tags(); + + if (tags.size()==0) { //This is an unknown word + tags = tdhmm.getOpenClass(); + } + else { + require_ambiguity_class(tdhmm, tags, *word, nw); + } + + k2=output[tags]; + + classes_ocurrences[k1]++; + classes_pair_ocurrences[k1][k2]++; //k1 followed by k2 + delete word; + word=lexmorfo.get_next_word(); + + k1=k2; + + } + + //Estimation of the number of time each tags occurs in the training text + for(i=0; i tags1, tags2; + set::iterator itag1, itag2; + for(k1=0; k10) + 
(tdhmm.getA())[i][j] = tags_pair_estimate[i][j]/sum; + else { + (tdhmm.getA())[i][j] = 0; + } + } + } + + //b[i][k] estimation + for(i=0; i0) + (tdhmm.getB())[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i]; + else + (tdhmm.getB())[i][k] = 0; + } + } + } + wcerr< > tags_pair(N, vector(N, 0)); + vector > emission(N, vector(M, 0)); +#endif + + + MorphoStream stream_tagged(ftagged, true, &tdhmm); + MorphoStream stream_untagged(funtagged, true, &tdhmm); + + TaggerWord *word_tagged=NULL, *word_untagged=NULL; + Collection &output = tdhmm.getOutput(); + + + set tags; + +#ifdef __GNUC__ + // Init counters - each event appears at least once. + // Espected likelihood estimate (ELE) with a fixed initial count of 1 + for(i=0; iget_superficial_form()!=word_untagged->get_superficial_form()) { + wcerr<get_tags().size()==0) // Unknown word + tag1 = -1; + else if (word_tagged->get_tags().size()>1) // Ambiguous word + wcerr<get_superficial_form()<get_tags()).begin(); + + + if ((tag1>=0) && (tag2>=0)) + tags_pair[tag2][tag1]++; + + + if (word_untagged->get_tags().size()==0) { // Unknown word + tags = tdhmm.getOpenClass(); + } + else { + require_ambiguity_class(tdhmm, word_untagged->get_tags(), *word_untagged, nw); + tags = word_untagged->get_tags(); + } + + k=output[tags]; + if(tag1>=0) + emission[tag1][k]++; + + delete word_tagged; + word_tagged=stream_tagged.get_next_word(); + delete word_untagged; + word_untagged=stream_untagged.get_next_word(); + } + + + //Estimate of a[i][j] + for(i=0; i &forbid_rules = tdhmm.getForbidRules(); + vector &enforce_rules = tdhmm.getEnforceRules(); + int N = tdhmm.getN(); + int i, j, j2; + bool found; + + for(i=0; i<(int) forbid_rules.size(); i++) { + (tdhmm.getA())[forbid_rules[i].tagi][forbid_rules[i].tagj] = ZERO; + } + + for(i=0; i<(int) enforce_rules.size(); i++) { + for(j=0; j0) + (tdhmm.getA())[i][j] = (tdhmm.getA())[i][j]/sum; + else + (tdhmm.getA())[i][j] = 0; + } + } +} + +void +HMM::read_dictionary(FILE *fdic) { + 
tagger_utils::read_dictionary(fdic, tdhmm); + int N = (tdhmm.getTagIndex()).size(); + int M = (tdhmm.getOutput()).size(); + wcerr << N << L" states and " << M < > ambiguity_classes; + MorphoStream morpho_stream(in, true, &tdhmm); + + TaggerWord *word = morpho_stream.get_next_word(); + + while(word) { + set tags = word->get_tags(); + if(tags.size() > 0) { + if(ambiguity_classes.find(tags) == ambiguity_classes.end()) { + ambiguity_classes.insert(tags); + word->outputOriginal(out); + //wcerr<get_string_tags()< tags, pretags; + set::iterator itag, jtag; + map gamma; + map ::iterator jt, kt; + map < int, map > alpha, beta, xsi, phi; + map < int, map >::iterator it; + double prob, loli; + vector < set > pending; + Collection &output = tdhmm.getOutput(); + + int ndesconocidas=0; + // alpha => forward probabilities + // beta => backward probabilities + + MorphoStream morpho_stream(ftxt, true, &tdhmm); + + loli = 0; + tag = eos; + tags.clear(); + tags.insert(tag); + pending.push_back(tags); + + alpha[0].clear(); + alpha[0][tag] = 1; + + word = morpho_stream.get_next_word(); + + while (word) { + + //wcerr<get_tags(); + + if (tags.size()==0) { // This is an unknown word + tags = tdhmm.getOpenClass(); + ndesconocidas++; + } + + require_ambiguity_class(tdhmm, tags, *word, nw); + + k = output[tags]; + len = pending.size(); + alpha[len].clear(); + + //Forward probabilities + for (itag=tags.begin(); itag!=tags.end(); itag++) { + i=*itag; + for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) { + j=*jtag; + //cerr<<"previous alpha["<1) { + pending.push_back(tags); + } else { // word is unambiguous + tag = *tags.begin(); + beta[0].clear(); + beta[0][tag] = 1; + + prob = alpha[len][tag]; + + //cerr<<"prob="<1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())[L"TAG_kEOF"]))) + wcerr<first; + for (jt=xsi[i].begin(); jt!=xsi[i].end(); jt++) { + j = jt->first; + if (xsi[i][j]>0) { + if (gamma[i]==0) { + wcerr<first; + for (kt=phi[i].begin(); kt!=phi[i].end(); kt++) { + k = kt->first; + 
if (phi[i][k]>0) { + (tdhmm.getB())[i][k] = phi[i][k]/gamma[i]; + + if (isnan((tdhmm.getB())[i][k])) { + wcerr< ambg_class_tags, tags, pretags; + set ::iterator itag, jtag; + + double prob, loli, x; + int N = tdhmm.getN(); +#ifdef __GNUC__ + double alpha[2][N]; + vector best[2][N]; +#else + vector > alpha(2, vector(N)); + vector > > best(2, vector >(N)); +#endif + + vector wpend; + int nwpend; + + MorphoStream morpho_stream(Input, debug, &tdhmm); + morpho_stream.setNullFlush(null_flush); + + Collection &output = tdhmm.getOutput(); + + loli = nw = 0; + + //Initialization + tags.insert(eos); + alpha[0][eos] = 1; + + word = morpho_stream.get_next_word(); + + while (word) { + wpend.push_back(*word); + nwpend = wpend.size(); + + pretags = tags; // Tags from the previous word + + tags = word->get_tags(); + + if (tags.size()==0) // This is an unknown word + tags = tdhmm.getOpenClass(); + + ambg_class_tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); + + k = output[ambg_class_tags]; //Ambiguity class the word belongs to + +#ifdef __GNUC__ + clear_array_double(alpha[nwpend%2], N); + clear_array_vector(best[nwpend%2], N); +#else + clear_array_double(&alpha[nwpend%2][0], N); + clear_array_vector(&best[nwpend%2][0], N); +#endif + + //Induction + for (itag=tags.begin(); itag!=tags.end(); itag++) { //For all tag from the current word + i=*itag; + for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) { //For all tags from the previous word + j=*jtag; + x = alpha[1-nwpend%2][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k]; + if (alpha[nwpend%2][i]<=x) { + if (nwpend>1) + best[nwpend%2][i] = best[1-nwpend%2][j]; + best[nwpend%2][i].push_back(i); + alpha[nwpend%2][i] = x; + } + } + } + + //Backtracking + if (tags.size()==1) { + tag = *tags.begin(); + + prob = alpha[nwpend%2][tag]; + + if (prob>0) + loli -= log(prob); + else { + if (debug) + wcerr<get_superficial_form()<get_string_tags()<1)&&(debug)) { + wstring errors; + errors = L"The text to disambiguate has 
finished, but there are ambiguous words that has not been disambiguated.\n"; + errors+= L"This message should never appears. If you are reading this ..... these are very bad news.\n"; + wcerr< ambiguity_class; + set::iterator itag; + cout<<"AMBIGUITY CLASSES\n-------------------------------\n"; + for(int i=0; i != tdhmm.getM(); i++) { + ambiguity_class = (tdhmm.getOutput())[i]; + cout < "/dev/stderr" + guesswarned=1 + } + if(seen[filename]) { + print "apertium-createmodes.awk: "filename" seen twice" > "/dev/stderr" + filename = 0 + } + else { + print "" > filename + seen[filename] = 1 + } + next +} + +filename { + print $0 >> filename + close(filename) +} Property changes on: branches/apertium-tagger/apertium2/apertium/apertium-createmodes.awk ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/apertium-tagger/apertium2/apertium/modes.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes.dtd (revision 69632) @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/modes.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes.rnc (revision 69632) @@ -0,0 +1,33 @@ +# Copyright (C) 2005-2016 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# DTD for the modes.xml file + +modes = element modes { attlist.modes, mode+ } +attlist.modes &= empty +mode = element mode { attlist.mode, pipeline } +attlist.mode &= attribute name { xsd:ID } +attlist.mode &= attribute install { text }? +attlist.mode &= attribute gendebug { text }? +pipeline = element pipeline { attlist.pipeline, program+ } +attlist.pipeline &= empty +program = element program { attlist.program, (file | arg)* } +attlist.program &= attribute name { text } +attlist.program &= attribute debug-suff { text }? +file = element file { attlist.file, empty } +attlist.file &= attribute name { text } +arg = element arg { attlist.arg, empty } +attlist.arg &= attribute name { text } +start = modes Index: branches/apertium-tagger/apertium2/apertium/modes.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes.rng (revision 69632) @@ -0,0 +1,106 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/modes2bash.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes2bash.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes2bash.xsl (revision 69632) @@ -0,0 +1,99 @@ + + + + + + + + + + + + + + + + + + + +# + + .mode + + + + + + + + ' + + ' + + +# modes/ + + .mode + + + + + + + + + + + + | + + + + + + + + 
+ + + + + + + + + + + + + + + + + ' + + / + + ' + + + + Index: branches/apertium-tagger/apertium2/apertium/modes2debugmodes.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/modes2debugmodes.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/modes2debugmodes.xsl (revision 69632) @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + **************** + + : **************** + + + + + + + + + + + + + - + + + -disam + + + -tagger + + + -pretransfer + + + -lex + + + -chunker + + + -interchunk + + + -postchunk + + + -dgen + + + -biltrans + + + -pgen + + + -morph + + + -morph + + + -NAMEME + + + + + + + + + + -t + + + -t + + + -t + + + -t + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/apertium_gen_wlist_lextor_translation.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_gen_wlist_lextor_translation.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_gen_wlist_lextor_translation.cc (revision 69632) @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include "getopt_long.h" +#include + +#include + +#include +#include +#include + +using namespace Apertium; +using namespace std; + + +void help(char *name) { + wcerr<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + if(monodic_file=="") { + wcerr<\n"; + for (int i=0; i. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "getopt_long.h" + +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +void message(char *progname) +{ + cerr << "USAGE: " << basename(progname) << " [-tz] t2x preproc [input [output]]" << endl; + cerr << " t2x t2x rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << "OPTIONS" <. + */ + +#include +#include +#include "getopt_long.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace Apertium; + +#define MODE_TRAINWRD 0 +#define MODE_TRAINLCH 1 +#define MODE_LEXTOR 2 +#define MODE_LEXTORTL 3 + +using namespace std; + + +void help(char *name) { + cerr<<"USAGE:\n"; + cerr<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + if (weight_exponent<0) { + wcerr<. + */ + +#include +#include +#include "getopt_long.h" + +#include + +#include +#include +#include +#include +#include +#include + +using namespace Apertium; +#define MODE_LEXTOR 1 +#define MODE_LEXTORTL 2 + +using namespace std; + + +void help(char *name) { + cerr<<"USAGE:\n"; + cerr<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + cerr<<"TH ANGLE: "<. 
+ */ +#include +#include + +#include +#include "getopt_long.h" +#include +#include +#include +#include +#include +#include +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +void message(char *progname) +{ + cerr << "USAGE: " << basename(progname) << " [-z] t3x preproc [input [output]]" << endl; + cerr << " t3x t3x rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << "OPTIONS" <. + */ +#include +#include +#include +#include +#include +#include "getopt_long.h" + +#include +#include "apertium_config.h" +#include + +#ifdef _MSC_VER +#include +#include +#endif +#include + +using namespace Apertium; +using namespace std; + +bool compound_sep = false; + +void readAndWriteUntil(FILE *input, FILE *output, int const charcode) +{ + int mychar; + + while((mychar = fgetwc_unlocked(input)) != charcode) + { + if(feof(input)) + { + wcerr << L"ERROR: Unexpected EOF" << endl; + exit(EXIT_FAILURE); + } + fputwc_unlocked(mychar, output); + if(mychar == L'\\') + { + mychar = fgetwc(input); + fputwc(mychar, output); + } + } +} + +void procWord(FILE *input, FILE *output, bool surface_forms) +{ + int mychar; + wstring buffer = L""; + + bool buffer_mode = false; + bool in_tag = false; + bool queuing = false; + + if(surface_forms) + { + while((mychar = fgetwc_unlocked(input)) != L'/') ; + } + + while((mychar = fgetwc_unlocked(input)) != L'$') + { + if(feof(input)) + { + wcerr << L"ERROR: Unexpected EOF" << endl; + exit(EXIT_FAILURE); + } + + switch(mychar) + { + case L'<': + in_tag = true; + if(!buffer_mode) + { + buffer_mode = true; + } + break; + + case L'>': + in_tag = false; + break; + + case L'#': + if(buffer_mode) + { + buffer_mode = false; + queuing = true; + } + break; + } + + if(buffer_mode) + { + if((mychar != L'+' || (mychar == L'+' && in_tag == true)) && + 
(mychar != L'~' || (mychar == L'~' && in_tag == true))) + { + buffer += static_cast(mychar); + } + else if(in_tag == false && mychar == L'+') + { + buffer.append(L"$ ^"); + } + else if(in_tag == false && mychar == L'~' and compound_sep == true) + { + buffer.append(L"$^"); + } + } + else + { + if(mychar == L'+' && queuing == true) + { + buffer.append(L"$ ^"); + buffer_mode = true; + } + else + { + fputwc_unlocked(mychar, output); + } + } + + } + fputws_unlocked(buffer.c_str(), output); +} + +void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms) +{ + while(true) + { + int mychar = fgetwc_unlocked(input); + if(feof(input)) + { + break; + } + switch(mychar) + { + case L'[': + fputwc_unlocked(L'[', output); + readAndWriteUntil(input, output, L']'); + fputwc_unlocked(L']', output); + break; + + case L'\\': + fputwc_unlocked(mychar, output); + fputwc_unlocked(fgetwc_unlocked(input), output); + break; + + case L'^': + fputwc_unlocked(mychar, output); + procWord(input, output, surface_forms); + fputwc_unlocked(L'$', output); + break; + + case L'\0': + fputwc_unlocked(mychar, output); + + if(null_flush) + { + fflush(output); + } + break; + + default: + fputwc_unlocked(mychar, output); + break; + } + } +} + +void usage(char *progname) +{ + wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl; + exit(EXIT_FAILURE); +} + + + + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + bool null_flush = false; + bool surface_forms = false; + + int option_index=0; + + while (true) { + static struct option long_options[] = + { + {"null-flush", no_argument, 0, 'z'}, + {"no-surface-forms", no_argument, 0, 'n'}, + {"compounds", no_argument, 0, 'e'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int c=getopt_long(argc, argv, "enzh", long_options, &option_index); + if (c==-1) + break; + + switch (c) + { + case 'z': + null_flush = true; + break; + + case 'e': + compound_sep = true; + break; + + case 'n': + 
surface_forms = true; + break; + + case 'h': + default: + usage(argv[0]); + break; + } + } + + if((argc-optind+1) > 3) + { + usage(argv[0]); + } + + FILE *input, *output; + + if((argc-optind+1) == 1) + { + input = stdin; + output = stdout; + } + else if ((argc-optind+1) == 2) + { + input = fopen(argv[argc-1], "r"); + if(!input) + { + usage(argv[0]); + } + output = stdout; + } + else + { + input = fopen(argv[argc-2], "r"); + output = fopen(argv[argc-1], "w"); + + if(!input || !output) + { + usage(argv[0]); + } + } + + if(feof(input)) + { + wcerr << L"ERROR: Can't read file '" << argv[1] << L"'" << endl; + exit(EXIT_FAILURE); + } + +#ifdef _MSC_VER + _setmode(_fileno(input), _O_U8TEXT); + _setmode(_fileno(output), _O_U8TEXT); +#endif + + processStream(input, output, null_flush, surface_forms); +} Index: branches/apertium-tagger/apertium2/apertium/apertium_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_tagger.h (revision 69632) @@ -0,0 +1,88 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef APERTIUM_TAGGER_H +#define APERTIUM_TAGGER_H + +#include "apertium_config.h" + +#include "basic_stream_tagger.h" +#include "basic_stream_tagger_trainer.h" +#include "basic_tagger.h" +#include "constructor_eq_delete.h" +#include "file_tagger.h" +#include "optional.h" + +#include "getopt_long.h" +#include + +namespace Apertium { +class apertium_tagger : private constructor_eq_delete { +public: + apertium_tagger(int &argc, char **&argv); + +private: + enum FunctionTypeType { Unigram, SlidingWindow }; + enum UnigramType { Stream_5_3_1, Stream_5_3_2, Stream_5_3_3 }; + enum FunctionType { Tagger, Retrain, Supervised, Train }; + static void help(); + + + static std::string option_string(const int &indexptr_); + static std::string option_string(const struct option &option_); + + + static void locale_global_(); + + + static const struct option longopts[]; + + + + void set_indexptr(); + + + void flagOptionCase(bool (basic_Tagger::Flags::*GetFlag)() const, + void (basic_Tagger::Flags::*SetFlag)(const bool &)); + std::string option_string(); + void functionTypeTypeOptionCase(const FunctionTypeType &FunctionTypeType_); + void functionTypeOptionCase(const FunctionType &FunctionType_); + void getIterationsArgument(); + unsigned long optarg_unsigned_long() const; + void g_StreamTagger(basic_StreamTagger &StreamTagger_); + void s_StreamTaggerTrainer(basic_StreamTaggerTrainer &StreamTaggerTrainer_); + void g_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + void r_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + void s_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + void t_FILE_Tagger(FILE_Tagger &FILE_Tagger_); + int &argc; + char **&argv; + int The_val; + + + int The_indexptr; + Optional FunctionTypeTypeOption_indexptr; + Optional FunctionTypeOption_indexptr; + + + Optional TheFunctionTypeType; + Optional TheUnigramType; + Optional TheFunctionType; + unsigned long TheFunctionTypeOptionArgument; + basic_Tagger::Flags TheFlags; +}; +} + +#endif // APERTIUM_TAGGER_H Index: 
branches/apertium-tagger/apertium2/apertium/apertium_tagger_apply_new_rules.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_tagger_apply_new_rules.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_tagger_apply_new_rules.cc (revision 69632) @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2004-2006 Felipe Sánchez-Martínez + * Copyright (C) 2006 Universitat d'Alacant + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include "getopt_long.h" + +#include +#include +#include +#include + +using namespace Apertium; + +using namespace std; + +//Global vars +TaggerDataHMM tagger_data_hmm; +TTag eos; //End-of-sentence tag + +void check_file(FILE *f, const string& path) { + if (!f) { + cerr<<"Error: cannot open file '"<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + //Now we check the command line arguments + if (filein=="") { + cerr<<"Error: You did not provide an input file (.prob). Use --filein to do that\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if (fileout=="") { + cerr<<"Error: You did not provide an output file (.prob). Use --fileout to do that\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if (filetsx=="") { + cerr<<"Error: You did not provide a tagger definition file (.tsx). 
Use --filetsx to do that\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + FILE *fin, *fout; + + fin=fopen(filein.c_str(), "rb"); + check_file(fin, filein); + + cerr<<"Reading apertium-tagger data from file '"<. + */ + +/* +#include +#include +#include +#include +#include +*/ + +#include "getopt_long.h" +#include +#include +#include +#include +#include +#include + +#include +#include + + +using namespace std; + +//Global vars +TaggerDataHMM tagger_data_hmm; +bool check_ambclasses; + +void check_file(FILE *f, const string& path) { + if (!f) { + cerr<<"Error: cannot open file '"<get_superficial_form())<<" "<get_string_tags())<<"\n"; + + if (check_ambclasses) { + int k=tagger_data_hmm.getOutput()[word->get_tags()]; + + if ((k>=tagger_data_hmm.getM())||(k<0)) { + cerr<<"Error: Ambiguity class number out of range: "<get_superficial_form())<<"\n"; + cerr<<"Ambiguity class: "<get_string_tags())<<"\n"; + } + } + + delete word; + + if ((corpus_length>0) && (nwords>=corpus_length)) + break; + + word=lexmorfo.get_next_word(); + } + cerr<] < file.crp \n\n"; + + cerr<<"ARGUMENTS: \n" + <<" --tsxfile|-x: Specify a tagger specification file\n" + <<" --probfile|-p: Specify a tagger parameter file\n" + <<" --clength|-l: Specify the length of the corpus to process\n"; +} + + +int main(int argc, char* argv[]) { + string tsxfile=""; + string probfile=""; + int corpus_length=-1; + + int c; + int option_index=0; + + cerr<<"LOCALE: "<.\n"; + exit(EXIT_SUCCESS); + break; + default: + help(argv[0]); + exit(EXIT_FAILURE); + break; + } + } + + if((tsxfile=="") && (probfile=="")) { + cerr<<"Error: You have provided neither a tagger specification file (.tsx) nor a tagger probability file (.prob). Use --tsxfile or --probfile to provide one of them\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if((tsxfile!="") && (probfile!="")) { + cerr<<"Error: You provided a tagger specification file and a tagger probability file. 
Only one of them can be provided, not both\n"; + help(argv[0]); + exit(EXIT_FAILURE); + } + + if (tsxfile!="") { + cerr<<"Reading tagger specification from file '"<. + */ +#include +#include "getopt_long.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; +using namespace std; + +void usage(char *progname) +{ + wcerr << L"USAGE: " << basename(progname) << L" [options] code1 code2 doc1 doc2 [output_file]" << endl; + wcerr << L"Options:" << endl; + wcerr << L" -p percent number 0 < n <= 1 to set margin of confidence of TU's " << endl; + wcerr << L" (0.85 by default) in length terms" << endl; + wcerr << L" -e edit number 0 < n <= 1 to set margin of confidence of TU's " << endl; + wcerr << L" (0.30 by default) in edit distance terms" << endl; + wcerr << L" -l low-limit ignore percent if the segment is less than lowlimit" < 1) + { + usage(argv[0]); + } + break; + case 'e': + edit_distance_percent = strtod(optarg, NULL); + if(edit_distance_percent <= 0 || edit_distance_percent > 1) + { + usage(argv[0]); + } + break; + + case 'l': + low_limit = atoi(optarg); + if(low_limit < 0) + { + usage(argv[0]); + } + break; + + case 'm': + max_edit = atoi(optarg); + if(max_edit < 0) + { + usage(argv[0]); + } + break; + + case 'd': + diagonal_width = atoi(optarg); + if(diagonal_width < 0) + { + usage(argv[0]); + } + break; + + case 'w': + window_size = atoi(optarg); + if(window_size < 0) + { + usage(argv[0]); + } + break; + + case 's': + step = atoi(optarg); + if(step < 0) + { + usage(argv[0]); + } + break; + + case 't': + translation = optarg; + break; + + + default: + //wcerr<. 
+ */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "getopt_long.h" +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +void message(char *progname) +{ + cerr << "USAGE: " << basename(progname) << " trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -b trules preproc [input [output]]" << endl; + cerr << " " << basename(progname) << " -n trules preproc [input [output]]" << endl; + cerr << " " << basename(progname) << " -x extended trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -c trules preproc biltrans [input [output]]" << endl; + cerr << " " << basename(progname) << " -t trules preproc biltrans [input [output]]" << endl; + cerr << " trules transfer rules file" << endl; + cerr << " preproc result of preprocess trules file" << endl; + cerr << " biltrans bilingual letter transducer file" << endl; + cerr << " input input file, standard input by default" << endl; + cerr << " output output file, standard output by default" << endl; + cerr << " -b input from lexical transfer" << endl; + cerr << " -n don't use bilingual dictionary" << endl; + cerr << " -x bindix extended mode with user dictionary" << endl; + cerr << " -c case-sensitiveness while accessing bilingual dictionary" << endl; + cerr << " -t trace (show rule numbers and patterns matched)" << endl; + cerr << " -T trace, for apertium-transfer-tools (also sets -t)" << endl; + cerr << " -z null-flushing output on '\0'" << endl; + cerr << " -h shows this message" << endl; + + + exit(EXIT_FAILURE); +} + +void testfile(string const &filename) +{ + struct stat mybuf; + if(stat(filename.c_str(), &mybuf) == -1) + { + cerr << "Error: can't stat file '"; + cerr << filename << "'." 
<< endl; + exit(EXIT_FAILURE); + } +} + +FILE * open_input(string const &filename) +{ + FILE *input = fopen(filename.c_str(), "r"); + if(!input) + { + cerr << "Error: can't open input file '"; + cerr << filename.c_str() << "'." << endl; + exit(EXIT_FAILURE); + } + + return input; +} + +FILE * open_output(string const &filename) +{ + FILE *output = fopen(filename.c_str(), "w"); + if(!output) + { + cerr << "Error: can't open output file '"; + cerr << filename.c_str() << "'." << endl; + exit(EXIT_FAILURE); + } + return output; +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + + Transfer t; + + int option_index=0; + + while (true) { + static struct option long_options[] = + { + {"from-bilingual", no_argument, 0, 'b'}, + {"no-bilingual", no_argument, 0, 'n'}, + {"extended", required_argument, 0, 'x'}, + {"case-sensitive", no_argument, 0, 'c'}, + {"null-flush", no_argument, 0, 'z'}, + {"trace", no_argument, 0, 't'}, + {"trace_att", no_argument, 0, 'T'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + + int c=getopt_long(argc, argv, "nbx:cztTh", long_options, &option_index); + if (c==-1) + break; + + switch (c) + { + case 'b': + t.setPreBilingual(true); + t.setUseBilingual(false); + break; + + case 'n': + t.setUseBilingual(false); + break; + + case 'x': + t.setExtendedDictionary(optarg); + break; + + case 'c': + t.setCaseSensitiveness(true); + break; + + case 't': + t.setTrace(true); + break; + + case 'T': + t.setTrace(true); + t.setTraceATT(true); + break; + + case 'z': + t.setNullFlush(true); + break; + + case 'h': + default: + message(argv[0]); + break; + } + } + + FILE *input = stdin, *output = stdout; + + switch(argc - optind + 1) + { + case 6: + output = open_output(argv[argc-1]); + input = open_input(argv[argc-2]); + testfile(argv[argc-3]); + testfile(argv[argc-4]); + testfile(argv[argc-5]); + t.read(argv[argc-5], argv[argc-4], argv[argc-3]); + break; + + case 5: + if(t.getUseBilingual() == false || t.getPreBilingual() == true) + { + 
output = open_output(argv[argc-1]); + input = open_input(argv[argc-2]); + testfile(argv[argc-3]); + testfile(argv[argc-4]); + t.read(argv[argc-4], argv[argc-3]); + } + else + { + input = open_input(argv[argc-1]); + testfile(argv[argc-2]); + testfile(argv[argc-3]); + testfile(argv[argc-4]); + t.read(argv[argc-4], argv[argc-3], argv[argc-2]); + } + break; + + case 4: + if(t.getUseBilingual() == false || t.getPreBilingual() == true) + { + input = open_input(argv[argc-1]); + testfile(argv[argc-2]); + testfile(argv[argc-3]); + t.read(argv[argc-3], argv[argc-2]); + } + else + { + testfile(argv[argc-1]); + testfile(argv[argc-2]); + testfile(argv[argc-3]); + t.read(argv[argc-3], argv[argc-2], argv[argc-1]); + } + break; + case 3: + if(t.getUseBilingual() == false || t.getPreBilingual() == true) + { + testfile(argv[argc-1]); + testfile(argv[argc-2]); + t.read(argv[argc-2], argv[argc-1]); + } + else + { + message(argv[0]); + } + break; + + default: + message(argv[0]); + break; + } + +#ifdef _MSC_VER + _setmode(_fileno(input), _O_U8TEXT); + _setmode(_fileno(output), _O_U8TEXT); +#endif + + t.transfer(input, output); + return EXIT_SUCCESS; +} Index: branches/apertium-tagger/apertium2/apertium/getopt_long.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/getopt_long.c (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/getopt_long.c (revision 69632) @@ -0,0 +1,1236 @@ +/* + * THIS IS NOT A CLEAN COPY OF GETOPT.C AND GETOPT1.C + * + * Implementation of getopt_long, cobbled together from getopt.c and + * getopt1.c from the GNU binutils distribution. This is more-or-less + * getopt.c inserted into getopt1.c, with the definition of getopt() + * commented out. + * + * Need to ifdef out optarg, optind, opterr, optopt, to handle the + * case where these are already defined for the benefit of system + * getopt() + * + * No, it's not pretty. 
+ */ + +/* getopt_long and getopt_long_only entry points for GNU getopt. + Copyright (C) 1987,88,89,90,91,92,93,94,96,97,98 + Free Software Foundation, Inc. + + NOTE: This source is derived from an old version taken from the GNU C + Library (glibc). + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + USA. */ + +#include + +#ifndef HAVE_GETOPT_LONG +/* We shouldn't be compiling this module in this case, but we clearly + are (damned configuration tools!), so avoid messing up. */ + +#include "getopt_long.h" +/* See getopt_long.h for discussion of THIS_IS__STDC__ */ + + +#if !defined THIS_IS__STDC__ || !THIS_IS__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +#ifndef const +#define const +#endif +#endif + +#include + + + +/* ******************** getopt.c ******************** */ +/* Getopt for GNU. + NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + + Copyright (C) 1987, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98 + Free Software Foundation, Inc. + + NOTE: This source is derived from an old version taken from the GNU C + Library (glibc). 
+ + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in . + Ditto for AIX 3.2 and . */ +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + + +#if !defined THIS_IS__STDC__ || !THIS_IS__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. 
*/ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. + When compiling libc, the _ macro is predefined. */ +# ifdef HAVE_LIBINTL_H +# include +# define _(msgid) gettext (msgid) +# else +# define _(msgid) (msgid) +# endif +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + + + +/* Define HAVE_GETOPT if the getopt function (and thus, which is more + * important to us, the getopt globals, optarg, optind, opterr and + * optopt) is defined by the system. Leave undefined if they should be + * defined here instead. + */ +#ifndef HAVE_GETOPT + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +char *optarg = NULL; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. 
+ + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +#endif /* #ifndef HAVE_GETOPT */ + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. */ + +int __getopt_initialized = 0; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. 
+ + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. + Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# if HAVE_STRINGS_H +# include +# endif +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif + +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. */ +# if (!defined THIS_IS__STDC__ || !THIS_IS__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. 
*/ +extern int strlen (const char *); +# endif /* not THIS_IS__STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. */ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; + +static int original_argc; +static char *const *original_argv; + +/* Make sure the environment variable bash 2.0 puts in the environment + is valid for the getopt call we must make sure that the ARGV passed + to getopt is that one passed to the process. */ +static void +__attribute__ ((unused)) +store_args_and_env (int argc, char *const *argv) +{ + /* XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). */ + original_argc = argc; + original_argv = argv; +} +# ifdef text_set_element +text_set_element (__libc_subinit, store_args_and_env); +# endif /* text_set_element */ + +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. 
+ + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. */ + +#if defined THIS_IS__STDC__ && THIS_IS__STDC__ +static void exchange (char **); +#endif + +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + +#ifdef _LIBC + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. The user plays games with us and + presents new arguments. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. 
*/ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. */ + +#if defined THIS_IS__STDC__ && THIS_IS__STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. */ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. 
*/ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + +#ifdef _LIBC + if (posixly_correct == NULL + && argc == original_argc && argv == original_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. + + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) 
+ + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else in the next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, the value of the option's `val' field + if the `flag' field is zero. + + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPTS of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call. + + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options. 
*/ + +#if 0 +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int *longind; + int long_only; +#endif +int +_getopt_internal (int argc, + char *const *argv, + const char *optstring, + const struct option *longopts, + int *longind, + int long_only) +{ + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#ifdef _LIBC +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). */ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. 
*/ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. + Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. + + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. 
*/ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. */ + ambig = 1; + } + + if (ambig && !exact) + { + if (opterr) + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (opterr) + { + if (argv[optind - 1][1] == '-') + /* --option */ + fprintf (stderr, + _("%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); + else + /* +option or -option */ + fprintf (stderr, + _("%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (opterr) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (opterr) + { + if (argv[optind][1] == '-') + /* --option */ + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); + else + /* +option or -option */ + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. */ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. */ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (opterr) + { + if (posixly_correct) + /* 1003.2 specifies the format of this message. 
*/ + fprintf (stderr, _("%s: illegal option -- %c\n"), + argv[0], c); + else + fprintf (stderr, _("%s: invalid option -- %c\n"), + argv[0], c); + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (opterr) + { + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. */ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. 
*/ + ambig = 1; + } + if (ambig && !exact) + { + if (opterr) + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. */ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (opterr) + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (opterr) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (opterr) + { + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. 
*/ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +/* +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int *) 0, + 0); +} +*/ + +#endif /* Not ELIDE_CODE. */ +/* ******************** ...getopt.c ******************** */ + + + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +#include +#if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +#define ELIDE_CODE +#endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. */ +#ifdef __GNU_LIBRARY__ +#include +#endif + +#ifndef NULL +#define NULL 0 +#endif + +/* K&R declarations!? C'mon... */ +/* Just say no to all this gymnastics */ +#if 0 +int +getopt_long (argc, argv, options, long_options, opt_index) + int argc; + char *const *argv; + const char *options; + const struct option *long_options; + int *opt_index; +#endif +int getopt_long (int argc, + char *const *argv, + const char *options, + const struct option *long_options, + int *opt_index) +{ + return _getopt_internal (argc, argv, options, long_options, opt_index, 0); +} + +/* Like getopt_long, but '-' as well as '--' can indicate a long option. + If an option that starts with '-' (not '--') doesn't match a long option, + but does match a short option, it is parsed as a short option + instead. 
*/ + +#if 0 +int +getopt_long_only (argc, argv, options, long_options, opt_index) + int argc; + char *const *argv; + const char *options; + const struct option *long_options; + int *opt_index; +#endif +int +getopt_long_only (int argc, + char *const *argv, + const char *options, + const struct option *long_options, + int *opt_index) +{ + return _getopt_internal (argc, argv, options, long_options, opt_index, 1); +} + + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +#include + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + int option_index = 0; + static struct option long_options[] = + { + {"add", 1, 0, 0}, + {"append", 0, 0, 0}, + {"delete", 1, 0, 0}, + {"verbose", 0, 0, 0}, + {"create", 0, 0, 0}, + {"file", 1, 0, 0}, + {0, 0, 0, 0} + }; + + c = getopt_long (argc, argv, "abc:d:0123456789", + long_options, &option_index); + if (c == -1) + break; + + switch (c) + { + case 0: + printf ("option %s", long_options[option_index].name); + if (optarg) + printf (" with arg %s", optarg); + printf ("\n"); + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case 'd': + printf ("option d with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ + +#endif /* #ifndef HAVE_GETOPT_LONG */ Index: branches/apertium-tagger/apertium2/apertium/getopt_long.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/getopt_long.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/getopt_long.h (revision 69632) @@ -0,0 +1,175 @@ +/* Declarations for getopt. + Copyright 1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 2000 + Free Software Foundation, Inc. + + NOTE: The canonical source of this file is maintained with the GNU C Library. + Bugs can be reported to bug-glibc@gnu.org. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + USA. */ + +#ifndef _GETOPT_LONG_H +#define _GETOPT_LONG_H 1 + +#include + +#if HAVE_UNISTD_H +/* Declares getopt, if present */ +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* We're building this with a C++ compiler, essentially. Such + compilers are not required to define __STDC__, but the path we + should follow, below, is indeed that marked by __STDC__. 
We don't + want to force a definition of __STDC__ (though that works), because + (a) that feels bad, and (b) some compilers perfectly reasonable + complain bitterly about it. So define THIS_IS__STDC__, and replace + occurrences of __STDC__ throughout with that. + + That means that all of the occurrences of THIS_IS__STDC__ in this + file and in getopt_long.c are redundant, but I'm leaving them here + in case it becomes necessary to do cleverer things with it than + simply define it to be 1, and also as a sort of warped documentation. */ +#define THIS_IS__STDC__ 1 + +#if !HAVE_DECL_GETOPT +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#endif /* ifndef HAVE_DECL_GETOPT */ + +#if !HAVE_DECL_GETOPT_LONG +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. 
+ + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. + + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +#if defined (THIS_IS__STDC__) && THIS_IS__STDC__ + const char *name; +#else + char *name; +#endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +#define no_argument 0 +#define required_argument 1 +#define optional_argument 2 + +#endif /* #if !HAVE_DECL_GETOPT_LONG */ + +#if defined (THIS_IS__STDC__) && THIS_IS__STDC__ +/* HAVE_DECL_* is a three-state macro: undefined, 0 or 1. If it is + undefined, we haven't run the autoconf check so provide the + declaration without arguments. If it is 0, we checked and failed + to find the declaration so provide a fully prototyped one. If it + is 1, we found it so don't provide any declaration at all. */ +#if defined (__GNU_LIBRARY__) || (defined (HAVE_DECL_GETOPT) && !HAVE_DECL_GETOPT) +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. 
*/ +extern int getopt (int argc, char *const *argv, const char *shortopts); +#else /* not __GNU_LIBRARY__ */ +# if !defined (HAVE_DECL_GETOPT) +extern int getopt (); +# endif +#endif /* __GNU_LIBRARY__ */ +#if !HAVE_DECL_GETOPT_LONG +extern int getopt_long (int argc, char *const *argv, const char *shortopts, + const struct option *longopts, int *longind); +extern int getopt_long_only (int argc, char *const *argv, + const char *shortopts, + const struct option *longopts, int *longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int argc, char *const *argv, + const char *shortopts, + const struct option *longopts, int *longind, + int long_only); +#endif /* HAVE_DECL_GETOPT_LONG */ +#else /* not THIS_IS__STDC__ */ +#if !HAVE_DECL_GETOPT +extern int getopt (); +#endif /* HAVE_DECL_GETOPT */ +#if !HAVE_DECL_GETOPT_LONG +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +#endif /* HAVE_DECL_GETOPT_LONG */ +#endif /* THIS_IS__STDC__ */ + + +#ifdef __cplusplus +} +#endif + +#endif /* getopt.h */ Index: branches/apertium-tagger/apertium2/apertium/win32/unistd.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/unistd.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/unistd.h (revision 69632) @@ -0,0 +1,13 @@ +// This should really be defined elsewhere +#define YY_INPUT(buf,result,max_size) \ + if ( (result = fread( (char *) buf, 1, max_size, yyin )) < 0 ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); + +#define fileno _fileno + +#if defined(_WIN32) && defined(isatty) +#undef isatty +#define isatty _isatty +#endif + +#define unlink _unlink Index: branches/apertium-tagger/apertium2/apertium/win32/snprintf.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/snprintf.c (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/win32/snprintf.c (revision 69632) @@ -0,0 +1,1025 @@ +/* + * snprintf.c - a portable implementation of snprintf + * + * AUTHOR + * Mark Martinec , April 1999. + * + * Copyright 1999, Mark Martinec. All rights reserved. + * + * TERMS AND CONDITIONS + * This program is free software; you can redistribute it and/or modify + * it under the terms of the "Frontier Artistic License" which comes + * with this Kit. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the Frontier Artistic License for more details. + * + * You should have received a copy of the Frontier Artistic License + * with this Kit in the file named LICENSE.txt . + * If not, I'll be glad to provide one. + * + * FEATURES + * - careful adherence to specs regarding flags, field width and precision; + * - good performance for large string handling (large format, large + * argument or large paddings). Performance is similar to system's sprintf + * and in several cases significantly better (make sure you compile with + * optimizations turned on, tell the compiler the code is strict ANSI + * if necessary to give it more freedom for optimizations); + * - return value semantics per ISO/IEC 9899:1999 ("ISO C99"); + * - written in standard ISO/ANSI C - requires an ANSI C compiler. + * + * SUPPORTED CONVERSION SPECIFIERS AND DATA TYPES + * + * This snprintf only supports the following conversion specifiers: + * s, c, d, u, o, x, X, p (and synonyms: i, D, U, O - see below) + * with flags: '-', '+', ' ', '0' and '#'. + * An asterisk is supported for field width as well as precision. + * + * Length modifiers 'h' (short int), 'l' (long int), + * and 'll' (long long int) are supported. 
+ * NOTE: + * If macro SNPRINTF_LONGLONG_SUPPORT is not defined (default) the + * length modifier 'll' is recognized but treated the same as 'l', + * which may cause argument value truncation! Defining + * SNPRINTF_LONGLONG_SUPPORT requires that your system's sprintf also + * handles length modifier 'll'. long long int is a language extension + * which may not be portable. + * + * Conversion of numeric data (conversion specifiers d, u, o, x, X, p) + * with length modifiers (none or h, l, ll) is left to the system routine + * sprintf, but all handling of flags, field width and precision as well as + * c and s conversions is done very carefully by this portable routine. + * If a string precision (truncation) is specified (e.g. %.8s) it is + * guaranteed the string beyond the specified precision will not be referenced. + * + * Length modifiers h, l and ll are ignored for c and s conversions (data + * types wint_t and wchar_t are not supported). + * + * The following common synonyms for conversion characters are supported: + * - i is a synonym for d + * - D is a synonym for ld, explicit length modifiers are ignored + * - U is a synonym for lu, explicit length modifiers are ignored + * - O is a synonym for lo, explicit length modifiers are ignored + * The D, O and U conversion characters are nonstandard, they are supported + * for backward compatibility only, and should not be used for new code. 
+ * + * The following is specifically NOT supported: + * - flag ' (thousands' grouping character) is recognized but ignored + * - numeric conversion specifiers: f, e, E, g, G and synonym F, + * as well as the new a and A conversion specifiers + * - length modifier 'L' (long double) and 'q' (quad - use 'll' instead) + * - wide character/string conversions: lc, ls, and nonstandard + * synonyms C and S + * - writeback of converted string length: conversion character n + * - the n$ specification for direct reference to n-th argument + * - locales + * + * It is permitted for str_m to be zero, and it is permitted to specify NULL + * pointer for resulting string argument if str_m is zero (as per ISO C99). + * + * The return value is the number of characters which would be generated + * for the given input, excluding the trailing null. If this value + * is greater than or equal to str_m, not all characters from the result + * have been stored in str, output bytes beyond the (str_m-1) -th character + * are discarded. If str_m is greater than zero it is guaranteed + * the resulting string will be null-terminated. + * + * NOTE that this matches the ISO C99, OpenBSD, and GNU C library 2.1, + * but is different from some older and vendor implementations, + * and is also different from XPG, XSH5, SUSv2 specifications. + * For historical discussion on changes in the semantics and standards + * of snprintf see printf(3) man page in the Linux programmers manual. + * + * Routines asprintf and vasprintf return a pointer (in the ptr argument) + * to a buffer sufficiently large to hold the resulting string. This pointer + * should be passed to free(3) to release the allocated storage when it is + * no longer needed. If sufficient space cannot be allocated, these functions + * will return -1 and set ptr to be a NULL pointer. These two routines are + * GNU C library extensions (glibc). 
+ * + * Routines asnprintf and vasnprintf are similar to asprintf and vasprintf, + * yet, like snprintf and vsnprintf counterparts, will write at most str_m-1 + * characters into the allocated output string, the last character in the + * allocated buffer then gets the terminating null. If the formatted string + * length (the return value) is greater than or equal to the str_m argument, + * the resulting string was truncated and some of the formatted characters + * were discarded. These routines present a handy way to limit the amount + * of allocated memory to some sane value. + * + * AVAILABILITY + * http://www.ijs.si/software/snprintf/ + * + * REVISION HISTORY + * 1999-04 V0.9 Mark Martinec + * - initial version, some modifications after comparing printf + * man pages for Digital Unix 4.0, Solaris 2.6 and HPUX 10, + * and checking how Perl handles sprintf (differently!); + * 1999-04-09 V1.0 Mark Martinec + * - added main test program, fixed remaining inconsistencies, + * added optional (long long int) support; + * 1999-04-12 V1.1 Mark Martinec + * - support the 'p' conversion (pointer to void); + * - if a string precision is specified + * make sure the string beyond the specified precision + * will not be referenced (e.g. by strlen); + * 1999-04-13 V1.2 Mark Martinec + * - support synonyms %D=%ld, %U=%lu, %O=%lo; + * - speed up the case of long format string with few conversions; + * 1999-06-30 V1.3 Mark Martinec + * - fixed runaway loop (eventually crashing when str_l wraps + * beyond 2^31) while copying format string without + * conversion specifiers to a buffer that is too short + * (thanks to Edwin Young for + * spotting the problem); + * - added macros PORTABLE_SNPRINTF_VERSION_(MAJOR|MINOR) + * to snprintf.h + * 2000-02-14 V2.0 (never released) Mark Martinec + * - relaxed license terms: The Artistic License now applies. 
+ * You may still apply the GNU GENERAL PUBLIC LICENSE + * as was distributed with previous versions, if you prefer; + * - changed REVISION HISTORY dates to use ISO 8601 date format; + * - added vsnprintf (patch also independently proposed by + * Caolan McNamara 2000-05-04, and Keith M Willenson 2000-06-01) + * 2000-06-27 V2.1 Mark Martinec + * - removed POSIX check for str_m<1; value 0 for str_m is + * allowed by ISO C99 (and GNU C library 2.1) - (pointed out + * on 2000-05-04 by Caolan McNamara, caolan@ csn dot ul dot ie). + * Besides relaxed license this change in standards adherence + * is the main reason to bump up the major version number; + * - added nonstandard routines asnprintf, vasnprintf, asprintf, + * vasprintf that dynamically allocate storage for the + * resulting string; these routines are not compiled by default, + * see comments where NEED_V?ASN?PRINTF macros are defined; + * - autoconf contributed by Caolan McNamara + * 2000-10-06 V2.2 Mark Martinec + * - BUG FIX: the %c conversion used a temporary variable + * that was no longer in scope when referenced, + * possibly causing incorrect resulting character; + * - BUG FIX: make precision and minimal field width unsigned + * to handle huge values (2^31 <= n < 2^32) correctly; + * also be more careful in the use of signed/unsigned/size_t + * internal variables - probably more careful than many + * vendor implementations, but there may still be a case + * where huge values of str_m, precision or minimal field + * could cause incorrect behaviour; + * - use separate variables for signed/unsigned arguments, + * and for short/int, long, and long long argument lengths + * to avoid possible incompatibilities on certain + * computer architectures. 
Also use separate variable + * arg_sign to hold sign of a numeric argument, + * to make code more transparent; + * - some fiddling with zero padding and "0x" to make it + * Linux compatible; + * - systematically use macros fast_memcpy and fast_memset + * instead of case-by-case hand optimization; determine some + * breakeven string lengths for different architectures; + * - terminology change: 'format' -> 'conversion specifier', + * 'C9x' -> 'ISO/IEC 9899:1999 ("ISO C99")', + * 'alternative form' -> 'alternate form', + * 'data type modifier' -> 'length modifier'; + * - several comments rephrased and new ones added; + * - make compiler not complain about 'credits' defined but + * not used; + */ + + +/* Define HAVE_SNPRINTF if your system already has snprintf and vsnprintf. + * + * If HAVE_SNPRINTF is defined this module will not produce code for + * snprintf and vsnprintf, unless PREFER_PORTABLE_SNPRINTF is defined as well, + * causing this portable version of snprintf to be called portable_snprintf + * (and portable_vsnprintf). + */ +/* #define HAVE_SNPRINTF */ + +/* Define PREFER_PORTABLE_SNPRINTF if your system does have snprintf and + * vsnprintf but you would prefer to use the portable routine(s) instead. + * In this case the portable routine is declared as portable_snprintf + * (and portable_vsnprintf) and a macro 'snprintf' (and 'vsnprintf') + * is defined to expand to 'portable_v?snprintf' - see file snprintf.h . + * Defining this macro is only useful if HAVE_SNPRINTF is also defined, + * but does no harm if defined nevertheless. + */ +/* #define PREFER_PORTABLE_SNPRINTF */ + +/* Define SNPRINTF_LONGLONG_SUPPORT if you want to support + * data type (long long int) and length modifier 'll' (e.g. %lld). + * If undefined, 'll' is recognized but treated as a single 'l'. + * + * If the system's sprintf does not handle 'll' + * the SNPRINTF_LONGLONG_SUPPORT must not be defined! + * + * This is off by default as (long long int) is a language extension.
+ */ +/* #define SNPRINTF_LONGLONG_SUPPORT */ + +/* Define NEED_SNPRINTF_ONLY if you only need snprintf, and not vsnprintf. + * If NEED_SNPRINTF_ONLY is defined, the snprintf will be defined directly, + * otherwise both snprintf and vsnprintf routines will be defined + * and snprintf will be a simple wrapper around vsnprintf, at the expense + * of an extra procedure call. + */ +/* #define NEED_SNPRINTF_ONLY */ + +/* Define NEED_V?ASN?PRINTF macros if you need library extension + * routines asprintf, vasprintf, asnprintf, vasnprintf respectively, + * and your system library does not provide them. They are all small + * wrapper routines around portable_vsnprintf. Defining any of the four + * NEED_V?ASN?PRINTF macros automatically turns off NEED_SNPRINTF_ONLY + * and turns on PREFER_PORTABLE_SNPRINTF. + * + * Watch for name conflicts with the system library if these routines + * are already present there. + * + * NOTE: vasprintf and vasnprintf routines need va_copy() from stdarg.h, as + * specified by C99, to be able to traverse the same list of arguments twice. + * I don't know of any other standard and portable way of achieving the same. + * With some versions of gcc you may use __va_copy(). You might even get away + * with "ap2 = ap", in this case you must not call va_end(ap2) ! + * #define va_copy(ap2,ap) ap2 = ap + */ +/* #define NEED_ASPRINTF */ +/* #define NEED_ASNPRINTF */ +/* #define NEED_VASPRINTF */ +/* #define NEED_VASNPRINTF */ + + +/* Define the following macros if desired: + * SOLARIS_COMPATIBLE, SOLARIS_BUG_COMPATIBLE, + * HPUX_COMPATIBLE, HPUX_BUG_COMPATIBLE, LINUX_COMPATIBLE, + * DIGITAL_UNIX_COMPATIBLE, DIGITAL_UNIX_BUG_COMPATIBLE, + * PERL_COMPATIBLE, PERL_BUG_COMPATIBLE, + * + * - For portable applications it is best not to rely on peculiarities + * of a given implementation so it may be best not to define any + * of the macros that select compatibility and to avoid features + * that vary among the systems. 
+ * + * - Selecting compatibility with more than one operating system + * is not strictly forbidden but is not recommended. + * + * - 'x'_BUG_COMPATIBLE implies 'x'_COMPATIBLE . + * + * - 'x'_COMPATIBLE refers to (and enables) a behaviour that is + * documented in a sprintf man page on a given operating system + * and actually adhered to by the system's sprintf (but not on + * most other operating systems). It may also refer to and enable + * a behaviour that is declared 'undefined' or 'implementation specific' + * in the man page but a given implementation behaves predictably + * in a certain way. + * + * - 'x'_BUG_COMPATIBLE refers to (and enables) a behaviour of system's sprintf + * that contradicts the sprintf man page on the same operating system. + * + * - I do not claim that the 'x'_COMPATIBLE and 'x'_BUG_COMPATIBLE + * conditionals take into account all idiosyncrasies of a particular + * implementation, there may be other incompatibilities. + */ + + + +/* ============================================== */ +/* NO USER SERVICEABLE PARTS FOLLOWING THIS POINT */ +/* ============================================== */ + +#define PORTABLE_SNPRINTF_VERSION_MAJOR 2 +#define PORTABLE_SNPRINTF_VERSION_MINOR 2 + +#if defined(NEED_ASPRINTF) || defined(NEED_ASNPRINTF) || defined(NEED_VASPRINTF) || defined(NEED_VASNPRINTF) +# if defined(NEED_SNPRINTF_ONLY) +# undef NEED_SNPRINTF_ONLY +# endif +# if !defined(PREFER_PORTABLE_SNPRINTF) +# define PREFER_PORTABLE_SNPRINTF +# endif +#endif + +#if defined(SOLARIS_BUG_COMPATIBLE) && !defined(SOLARIS_COMPATIBLE) +#define SOLARIS_COMPATIBLE +#endif + +#if defined(HPUX_BUG_COMPATIBLE) && !defined(HPUX_COMPATIBLE) +#define HPUX_COMPATIBLE +#endif + +#if defined(DIGITAL_UNIX_BUG_COMPATIBLE) && !defined(DIGITAL_UNIX_COMPATIBLE) +#define DIGITAL_UNIX_COMPATIBLE +#endif + +#if defined(PERL_BUG_COMPATIBLE) && !defined(PERL_COMPATIBLE) +#define PERL_COMPATIBLE +#endif + +#if defined(LINUX_BUG_COMPATIBLE) && !defined(LINUX_COMPATIBLE) +#define
LINUX_COMPATIBLE +#endif + +#include +#include +#include +#include +#include +#include +#include + +#ifdef isdigit +#undef isdigit +#endif +#define isdigit(c) ((c) >= '0' && (c) <= '9') + +/* For copying strings longer or equal to 'breakeven_point' + * it is more efficient to call memcpy() than to do it inline. + * The value depends mostly on the processor architecture, + * but also on the compiler and its optimization capabilities. + * The value is not critical, some small value greater than zero + * will be just fine if you don't care to squeeze every drop + * of performance out of the code. + * + * Small values favor memcpy, large values favor inline code. + */ +#if defined(__alpha__) || defined(__alpha) +# define breakeven_point 2 /* AXP (DEC Alpha) - gcc or cc or egcs */ +#endif +#if defined(__i386__) || defined(__i386) +# define breakeven_point 12 /* Intel Pentium/Linux - gcc 2.96 */ +#endif +#if defined(__hppa) +# define breakeven_point 10 /* HP-PA - gcc */ +#endif +#if defined(__sparc__) || defined(__sparc) +# define breakeven_point 33 /* Sun Sparc 5 - gcc 2.8.1 */ +#endif + +/* some other values of possible interest: */ +/* #define breakeven_point 8 */ /* VAX 4000 - vaxc */ +/* #define breakeven_point 19 */ /* VAX 4000 - gcc 2.7.0 */ + +#ifndef breakeven_point +# define breakeven_point 6 /* some reasonable one-size-fits-all value */ +#endif + +#define fast_memcpy(d,s,n) \ + { register size_t nn = (size_t)(n); \ + if (nn >= breakeven_point) memcpy((d), (s), nn); \ + else if (nn > 0) { /* proc call overhead is worth only for large strings*/\ + register char *dd; register const char *ss; \ + for (ss=(s), dd=(d); nn>0; nn--) *dd++ = *ss++; } } + +#define fast_memset(d,c,n) \ + { register size_t nn = (size_t)(n); \ + if (nn >= breakeven_point) memset((d), (int)(c), nn); \ + else if (nn > 0) { /* proc call overhead is worth only for large strings*/\ + register char *dd; register const int cc=(int)(c); \ + for (dd=(d); nn>0; nn--) *dd++ = cc; } } + +/* prototypes 
*/ + +#if defined(NEED_ASPRINTF) +int asprintf (char **ptr, const char *fmt, /*args*/ ...); +#endif +#if defined(NEED_VASPRINTF) +int vasprintf (char **ptr, const char *fmt, va_list ap); +#endif +#if defined(NEED_ASNPRINTF) +int asnprintf (char **ptr, size_t str_m, const char *fmt, /*args*/ ...); +#endif +#if defined(NEED_VASNPRINTF) +int vasnprintf (char **ptr, size_t str_m, const char *fmt, va_list ap); +#endif + +#if defined(HAVE_SNPRINTF) +/* declare our portable snprintf routine under name portable_snprintf */ +/* declare our portable vsnprintf routine under name portable_vsnprintf */ +#else +/* declare our portable routines under names snprintf and vsnprintf */ +#define portable_snprintf snprintf +#if !defined(NEED_SNPRINTF_ONLY) +#define portable_vsnprintf vsnprintf +#endif +#endif + +#if !defined(HAVE_SNPRINTF) || defined(PREFER_PORTABLE_SNPRINTF) +int portable_snprintf(char *str, size_t str_m, const char *fmt, /*args*/ ...); +#if !defined(NEED_SNPRINTF_ONLY) +int portable_vsnprintf(char *str, size_t str_m, const char *fmt, va_list ap); +#endif +#endif + +/* declarations */ + +static char credits[] = "\n\ +@(#)snprintf.c, v2.2: Mark Martinec, \n\ +@(#)snprintf.c, v2.2: Copyright 1999, Mark Martinec. Frontier Artistic License applies.\n\ +@(#)snprintf.c, v2.2: http://www.ijs.si/software/snprintf/\n"; + +#if defined(NEED_ASPRINTF) +int asprintf(char **ptr, const char *fmt, /*args*/ ...) 
{ + va_list ap; + size_t str_m; + int str_l; + + *ptr = NULL; + va_start(ap, fmt); /* measure the required size */ + str_l = portable_vsnprintf(NULL, (size_t)0, fmt, ap); + va_end(ap); + assert(str_l >= 0); /* possible integer overflow if str_m > INT_MAX */ + *ptr = (char *) malloc(str_m = (size_t)str_l + 1); + if (*ptr == NULL) { errno = ENOMEM; str_l = -1; } + else { + int str_l2; + va_start(ap, fmt); + str_l2 = portable_vsnprintf(*ptr, str_m, fmt, ap); + va_end(ap); + assert(str_l2 == str_l); + } + return str_l; +} +#endif + +#if defined(NEED_VASPRINTF) +int vasprintf(char **ptr, const char *fmt, va_list ap) { + size_t str_m; + int str_l; + + *ptr = NULL; + { va_list ap2; + va_copy(ap2, ap); /* don't consume the original ap, we'll need it again */ + str_l = portable_vsnprintf(NULL, (size_t)0, fmt, ap2);/*get required size*/ + va_end(ap2); + } + assert(str_l >= 0); /* possible integer overflow if str_m > INT_MAX */ + *ptr = (char *) malloc(str_m = (size_t)str_l + 1); + if (*ptr == NULL) { errno = ENOMEM; str_l = -1; } + else { + int str_l2 = portable_vsnprintf(*ptr, str_m, fmt, ap); + assert(str_l2 == str_l); + } + return str_l; +} +#endif + +#if defined(NEED_ASNPRINTF) +int asnprintf (char **ptr, size_t str_m, const char *fmt, /*args*/ ...) 
{ + va_list ap; + int str_l; + + *ptr = NULL; + va_start(ap, fmt); /* measure the required size */ + str_l = portable_vsnprintf(NULL, (size_t)0, fmt, ap); + va_end(ap); + assert(str_l >= 0); /* possible integer overflow if str_m > INT_MAX */ + if ((size_t)str_l + 1 < str_m) str_m = (size_t)str_l + 1; /* truncate */ + /* if str_m is 0, no buffer is allocated, just set *ptr to NULL */ + if (str_m == 0) { /* not interested in resulting string, just return size */ + } else { + *ptr = (char *) malloc(str_m); + if (*ptr == NULL) { errno = ENOMEM; str_l = -1; } + else { + int str_l2; + va_start(ap, fmt); + str_l2 = portable_vsnprintf(*ptr, str_m, fmt, ap); + va_end(ap); + assert(str_l2 == str_l); + } + } + return str_l; +} +#endif + +#if defined(NEED_VASNPRINTF) +int vasnprintf (char **ptr, size_t str_m, const char *fmt, va_list ap) { + int str_l; + + *ptr = NULL; + { va_list ap2; + va_copy(ap2, ap); /* don't consume the original ap, we'll need it again */ + str_l = portable_vsnprintf(NULL, (size_t)0, fmt, ap2);/*get required size*/ + va_end(ap2); + } + assert(str_l >= 0); /* possible integer overflow if str_m > INT_MAX */ + if ((size_t)str_l + 1 < str_m) str_m = (size_t)str_l + 1; /* truncate */ + /* if str_m is 0, no buffer is allocated, just set *ptr to NULL */ + if (str_m == 0) { /* not interested in resulting string, just return size */ + } else { + *ptr = (char *) malloc(str_m); + if (*ptr == NULL) { errno = ENOMEM; str_l = -1; } + else { + int str_l2 = portable_vsnprintf(*ptr, str_m, fmt, ap); + assert(str_l2 == str_l); + } + } + return str_l; +} +#endif + +/* + * If the system does have snprintf and the portable routine is not + * specifically required, this module produces no code for snprintf/vsnprintf. + */ +#if !defined(HAVE_SNPRINTF) || defined(PREFER_PORTABLE_SNPRINTF) + +#if !defined(NEED_SNPRINTF_ONLY) +int portable_snprintf(char *str, size_t str_m, const char *fmt, /*args*/ ...) 
{ + va_list ap; + int str_l; + + va_start(ap, fmt); + str_l = portable_vsnprintf(str, str_m, fmt, ap); + va_end(ap); + return str_l; +} +#endif + +#if defined(NEED_SNPRINTF_ONLY) +int portable_snprintf(char *str, size_t str_m, const char *fmt, /*args*/ ...) { +#else +int portable_vsnprintf(char *str, size_t str_m, const char *fmt, va_list ap) { +#endif + +#if defined(NEED_SNPRINTF_ONLY) + va_list ap; +#endif + size_t str_l = 0; + const char *p = fmt; + +/* In contrast with POSIX, the ISO C99 now says + * that str can be NULL and str_m can be 0. + * This is more useful than the old: if (str_m < 1) return -1; */ + +#if defined(NEED_SNPRINTF_ONLY) + va_start(ap, fmt); +#endif + if (!p) p = ""; + while (*p) { + if (*p != '%') { + /* if (str_l < str_m) str[str_l++] = *p++; -- this would be sufficient */ + /* but the following code achieves better performance for cases + * where format string is long and contains few conversions */ + const char *q = strchr(p+1,'%'); + size_t n = !q ? strlen(p) : (q-p); + if (str_l < str_m) { + size_t avail = str_m-str_l; + fast_memcpy(str+str_l, p, (n>avail?avail:n)); + } + p += n; str_l += n; + } else { + const char *starting_p; + size_t min_field_width = 0, precision = 0; + int zero_padding = 0, precision_specified = 0, justify_left = 0; + int alternate_form = 0, force_sign = 0; + int space_for_positive = 1; /* If both the ' ' and '+' flags appear, + the ' ' flag should be ignored. */ + char length_modifier = '\0'; /* allowed values: \0, h, l, L */ + char tmp[32];/* temporary buffer for simple numeric->string conversion */ + + const char *str_arg; /* string address in case of string argument */ + size_t str_arg_l; /* natural field width of arg without padding + and sign */ + unsigned char uchar_arg; + /* unsigned char argument value - only defined for c conversion. + N.B. 
standard explicitly states the char argument for + the c conversion is unsigned */ + + size_t number_of_zeros_to_pad = 0; + /* number of zeros to be inserted for numeric conversions + as required by the precision or minimal field width */ + + size_t zero_padding_insertion_ind = 0; + /* index into tmp where zero padding is to be inserted */ + + char fmt_spec = '\0'; + /* current conversion specifier character */ + + str_arg = credits;/* just to make compiler happy (defined but not used)*/ + str_arg = NULL; + starting_p = p; p++; /* skip '%' */ + /* parse flags */ + while (*p == '0' || *p == '-' || *p == '+' || + *p == ' ' || *p == '#' || *p == '\'') { + switch (*p) { + case '0': zero_padding = 1; break; + case '-': justify_left = 1; break; + case '+': force_sign = 1; space_for_positive = 0; break; + case ' ': force_sign = 1; + /* If both the ' ' and '+' flags appear, the ' ' flag should be ignored */ +#ifdef PERL_COMPATIBLE + /* ... but in Perl the last of ' ' and '+' applies */ + space_for_positive = 1; +#endif + break; + case '#': alternate_form = 1; break; + case '\'': break; + } + p++; + } + /* If the '0' and '-' flags both appear, the '0' flag should be ignored. */ + + /* parse field width */ + if (*p == '*') { + int j; + p++; j = va_arg(ap, int); + if (j >= 0) min_field_width = j; + else { min_field_width = -j; justify_left = 1; } + } else if (isdigit((int)(*p))) { + /* size_t could be wider than unsigned int; + make sure we treat argument like common implementations do */ + unsigned int uj = *p++ - '0'; + while (isdigit((int)(*p))) uj = 10*uj + (unsigned int)(*p++ - '0'); + min_field_width = uj; + } + /* parse precision */ + if (*p == '.') { + p++; precision_specified = 1; + if (*p == '*') { + int j = va_arg(ap, int); + p++; + if (j >= 0) precision = j; + else { + precision_specified = 0; precision = 0; + /* NOTE: + * Solaris 2.6 man page claims that in this case the precision + * should be set to 0. 
Digital Unix 4.0, HPUX 10 and BSD man page + * claim that this case should be treated as unspecified precision, + * which is what we do here. + */ + } + } else if (isdigit((int)(*p))) { + /* size_t could be wider than unsigned int; + make sure we treat argument like common implementations do */ + unsigned int uj = *p++ - '0'; + while (isdigit((int)(*p))) uj = 10*uj + (unsigned int)(*p++ - '0'); + precision = uj; + } + } + /* parse 'h', 'l' and 'll' length modifiers */ + if (*p == 'h' || *p == 'l') { + length_modifier = *p; p++; + if (length_modifier == 'l' && *p == 'l') { /* double l = long long */ +#ifdef SNPRINTF_LONGLONG_SUPPORT + length_modifier = '2'; /* double l encoded as '2' */ +#else + length_modifier = 'l'; /* treat it as a single 'l' */ +#endif + p++; + } + } + fmt_spec = *p; + /* common synonyms: */ + switch (fmt_spec) { + case 'i': fmt_spec = 'd'; break; + case 'D': fmt_spec = 'd'; length_modifier = 'l'; break; + case 'U': fmt_spec = 'u'; length_modifier = 'l'; break; + case 'O': fmt_spec = 'o'; length_modifier = 'l'; break; + default: break; + } + /* get parameter value, do initial processing */ + switch (fmt_spec) { + case '%': /* % behaves similar to 's' regarding flags and field widths */ + case 'c': /* c behaves similar to 's' regarding flags and field widths */ + case 's': + length_modifier = '\0'; /* wint_t and wchar_t not supported */ + /* the result of zero padding flag with non-numeric conversion specifier*/ + /* is undefined. Solaris and HPUX 10 does zero padding in this case, */ + /* Digital Unix and Linux does not. 
*/ +#if !defined(SOLARIS_COMPATIBLE) && !defined(HPUX_COMPATIBLE) + zero_padding = 0; /* turn zero padding off for string conversions */ +#endif + str_arg_l = 1; + switch (fmt_spec) { + case '%': + str_arg = p; break; + case 'c': { + int j = va_arg(ap, int); + uchar_arg = (unsigned char) j; /* standard demands unsigned char */ + str_arg = (const char *) &uchar_arg; + break; + } + case 's': + str_arg = va_arg(ap, const char *); + if (!str_arg) str_arg_l = 0; + /* make sure not to address string beyond the specified precision !!! */ + else if (!precision_specified) str_arg_l = strlen(str_arg); + /* truncate string if necessary as requested by precision */ + else if (precision == 0) str_arg_l = 0; + else { + /* memchr on HP does not like n > 2^31 !!! */ + const char *q = memchr(str_arg, '\0', + precision <= 0x7fffffff ? precision : 0x7fffffff); + str_arg_l = !q ? precision : (q-str_arg); + } + break; + default: break; + } + break; + case 'd': case 'u': case 'o': case 'x': case 'X': case 'p': { + /* NOTE: the u, o, x, X and p conversion specifiers imply + the value is unsigned; d implies a signed value */ + + int arg_sign = 0; + /* 0 if numeric argument is zero (or if pointer is NULL for 'p'), + +1 if greater than zero (or nonzero for unsigned arguments), + -1 if negative (unsigned argument is never negative) */ + + int int_arg = 0; unsigned int uint_arg = 0; + /* only defined for length modifier h, or for no length modifiers */ + + long int long_arg = 0; unsigned long int ulong_arg = 0; + /* only defined for length modifier l */ + + void *ptr_arg = NULL; + /* pointer argument value -only defined for p conversion */ + +#ifdef SNPRINTF_LONGLONG_SUPPORT + long long int long_long_arg = 0; + unsigned long long int ulong_long_arg = 0; + /* only defined for length modifier ll */ +#endif + if (fmt_spec == 'p') { + /* HPUX 10: An l, h, ll or L before any other conversion character + * (other than d, i, u, o, x, or X) is ignored. 
+ * Digital Unix: + * not specified, but seems to behave as HPUX does. + * Solaris: If an h, l, or L appears before any other conversion + * specifier (other than d, i, u, o, x, or X), the behavior + * is undefined. (Actually %hp converts only 16-bits of address + * and %llp treats address as 64-bit data which is incompatible + * with (void *) argument on a 32-bit system). + */ +#ifdef SOLARIS_COMPATIBLE +# ifdef SOLARIS_BUG_COMPATIBLE + /* keep length modifiers even if it represents 'll' */ +# else + if (length_modifier == '2') length_modifier = '\0'; +# endif +#else + length_modifier = '\0'; +#endif + ptr_arg = va_arg(ap, void *); + if (ptr_arg != NULL) arg_sign = 1; + } else if (fmt_spec == 'd') { /* signed */ + switch (length_modifier) { + case '\0': + case 'h': + /* It is non-portable to specify a second argument of char or short + * to va_arg, because arguments seen by the called function + * are not char or short. C converts char and short arguments + * to int before passing them to a function. 
+ */ + int_arg = va_arg(ap, int); + if (int_arg > 0) arg_sign = 1; + else if (int_arg < 0) arg_sign = -1; + break; + case 'l': + long_arg = va_arg(ap, long int); + if (long_arg > 0) arg_sign = 1; + else if (long_arg < 0) arg_sign = -1; + break; +#ifdef SNPRINTF_LONGLONG_SUPPORT + case '2': + long_long_arg = va_arg(ap, long long int); + if (long_long_arg > 0) arg_sign = 1; + else if (long_long_arg < 0) arg_sign = -1; + break; +#endif + } + } else { /* unsigned */ + switch (length_modifier) { + case '\0': + case 'h': + uint_arg = va_arg(ap, unsigned int); + if (uint_arg) arg_sign = 1; + break; + case 'l': + ulong_arg = va_arg(ap, unsigned long int); + if (ulong_arg) arg_sign = 1; + break; +#ifdef SNPRINTF_LONGLONG_SUPPORT + case '2': + ulong_long_arg = va_arg(ap, unsigned long long int); + if (ulong_long_arg) arg_sign = 1; + break; +#endif + } + } + str_arg = tmp; str_arg_l = 0; + /* NOTE: + * For d, i, u, o, x, and X conversions, if precision is specified, + * the '0' flag should be ignored. This is so with Solaris 2.6, + * Digital UNIX 4.0, HPUX 10, Linux, FreeBSD, NetBSD; but not with Perl. + */ +#ifndef PERL_COMPATIBLE + if (precision_specified) zero_padding = 0; +#endif + if (fmt_spec == 'd') { + if (force_sign && arg_sign >= 0) + tmp[str_arg_l++] = space_for_positive ? ' ' : '+'; + /* leave negative numbers for sprintf to handle, + to avoid handling tricky cases like (short int)(-32768) */ +#ifdef LINUX_COMPATIBLE + } else if (fmt_spec == 'p' && force_sign && arg_sign > 0) { + tmp[str_arg_l++] = space_for_positive ? ' ' : '+'; +#endif + } else if (alternate_form) { + if (arg_sign != 0 && (fmt_spec == 'x' || fmt_spec == 'X') ) + { tmp[str_arg_l++] = '0'; tmp[str_arg_l++] = fmt_spec; } + /* alternate form should have no effect for p conversion, but ... */ +#ifdef HPUX_COMPATIBLE + else if (fmt_spec == 'p' + /* HPUX 10: for an alternate form of p conversion, + * a nonzero result is prefixed by 0x. 
*/ +#ifndef HPUX_BUG_COMPATIBLE + /* Actually it uses 0x prefix even for a zero value. */ + && arg_sign != 0 +#endif + ) { tmp[str_arg_l++] = '0'; tmp[str_arg_l++] = 'x'; } +#endif + } + zero_padding_insertion_ind = str_arg_l; + if (!precision_specified) precision = 1; /* default precision is 1 */ + if (precision == 0 && arg_sign == 0 +#if defined(HPUX_BUG_COMPATIBLE) || defined(LINUX_COMPATIBLE) + && fmt_spec != 'p' + /* HPUX 10 man page claims: With conversion character p the result of + * converting a zero value with a precision of zero is a null string. + * Actually HP returns all zeroes, and Linux returns "(nil)". */ +#endif + ) { + /* converted to null string */ + /* When zero value is formatted with an explicit precision 0, + the resulting formatted string is empty (d, i, u, o, x, X, p). */ + } else { + char f[5]; int f_l = 0; + f[f_l++] = '%'; /* construct a simple format string for sprintf */ + if (!length_modifier) { } + else if (length_modifier=='2') { f[f_l++] = 'l'; f[f_l++] = 'l'; } + else f[f_l++] = length_modifier; + f[f_l++] = fmt_spec; f[f_l++] = '\0'; + if (fmt_spec == 'p') str_arg_l += sprintf(tmp+str_arg_l, f, ptr_arg); + else if (fmt_spec == 'd') { /* signed */ + switch (length_modifier) { + case '\0': + case 'h': str_arg_l+=sprintf(tmp+str_arg_l, f, int_arg); break; + case 'l': str_arg_l+=sprintf(tmp+str_arg_l, f, long_arg); break; +#ifdef SNPRINTF_LONGLONG_SUPPORT + case '2': str_arg_l+=sprintf(tmp+str_arg_l,f,long_long_arg); break; +#endif + } + } else { /* unsigned */ + switch (length_modifier) { + case '\0': + case 'h': str_arg_l+=sprintf(tmp+str_arg_l, f, uint_arg); break; + case 'l': str_arg_l+=sprintf(tmp+str_arg_l, f, ulong_arg); break; +#ifdef SNPRINTF_LONGLONG_SUPPORT + case '2': str_arg_l+=sprintf(tmp+str_arg_l,f,ulong_long_arg);break; +#endif + } + } + /* include the optional minus sign and possible "0x" + in the region before the zero padding insertion point */ + if (zero_padding_insertion_ind < str_arg_l && + 
tmp[zero_padding_insertion_ind] == '-') { + zero_padding_insertion_ind++; + } + if (zero_padding_insertion_ind+1 < str_arg_l && + tmp[zero_padding_insertion_ind] == '0' && + (tmp[zero_padding_insertion_ind+1] == 'x' || + tmp[zero_padding_insertion_ind+1] == 'X') ) { + zero_padding_insertion_ind += 2; + } + } + { size_t num_of_digits = str_arg_l - zero_padding_insertion_ind; + if (alternate_form && fmt_spec == 'o' +#ifdef HPUX_COMPATIBLE /* ("%#.o",0) -> "" */ + && (str_arg_l > 0) +#endif +#ifdef DIGITAL_UNIX_BUG_COMPATIBLE /* ("%#o",0) -> "00" */ +#else + /* unless zero is already the first character */ + && !(zero_padding_insertion_ind < str_arg_l + && tmp[zero_padding_insertion_ind] == '0') +#endif + ) { /* assure leading zero for alternate-form octal numbers */ + if (!precision_specified || precision < num_of_digits+1) { + /* precision is increased to force the first character to be zero, + except if a zero value is formatted with an explicit precision + of zero */ + precision = num_of_digits+1; precision_specified = 1; + } + } + /* zero padding to specified precision? */ + if (num_of_digits < precision) + number_of_zeros_to_pad = precision - num_of_digits; + } + /* zero padding to specified minimal field width? */ + if (!justify_left && zero_padding) { + int n = min_field_width - (str_arg_l+number_of_zeros_to_pad); + if (n > 0) number_of_zeros_to_pad += n; + } + break; + } + default: /* unrecognized conversion specifier, keep format string as-is*/ + zero_padding = 0; /* turn zero padding off for non-numeric convers. 
*/ +#ifndef DIGITAL_UNIX_COMPATIBLE + justify_left = 1; min_field_width = 0; /* reset flags */ +#endif +#if defined(PERL_COMPATIBLE) || defined(LINUX_COMPATIBLE) + /* keep the entire format string unchanged */ + str_arg = starting_p; str_arg_l = p - starting_p; + /* well, not exactly so for Linux, which does something inbetween, + * and I don't feel an urge to imitate it: "%+++++hy" -> "%+y" */ +#else + /* discard the unrecognized conversion, just keep * + * the unrecognized conversion character */ + str_arg = p; str_arg_l = 0; +#endif + if (*p) str_arg_l++; /* include invalid conversion specifier unchanged + if not at end-of-string */ + break; + } + if (*p) p++; /* step over the just processed conversion specifier */ + /* insert padding to the left as requested by min_field_width; + this does not include the zero padding in case of numerical conversions*/ + if (!justify_left) { /* left padding with blank or zero */ + int n = min_field_width - (str_arg_l+number_of_zeros_to_pad); + if (n > 0) { + if (str_l < str_m) { + size_t avail = str_m-str_l; + fast_memset(str+str_l, (zero_padding?'0':' '), (n>avail?avail:n)); + } + str_l += n; + } + } + /* zero padding as requested by the precision or by the minimal field width + * for numeric conversions required? 
*/ + if (number_of_zeros_to_pad <= 0) { + /* will not copy first part of numeric right now, * + * force it to be copied later in its entirety */ + zero_padding_insertion_ind = 0; + } else { + /* insert first part of numerics (sign or '0x') before zero padding */ + int n = zero_padding_insertion_ind; + if (n > 0) { + if (str_l < str_m) { + size_t avail = str_m-str_l; + fast_memcpy(str+str_l, str_arg, (n>avail?avail:n)); + } + str_l += n; + } + /* insert zero padding as requested by the precision or min field width */ + n = number_of_zeros_to_pad; + if (n > 0) { + if (str_l < str_m) { + size_t avail = str_m-str_l; + fast_memset(str+str_l, '0', (n>avail?avail:n)); + } + str_l += n; + } + } + /* insert formatted string + * (or as-is conversion specifier for unknown conversions) */ + { int n = str_arg_l - zero_padding_insertion_ind; + if (n > 0) { + if (str_l < str_m) { + size_t avail = str_m-str_l; + fast_memcpy(str+str_l, str_arg+zero_padding_insertion_ind, + (n>avail?avail:n)); + } + str_l += n; + } + } + /* insert right padding */ + if (justify_left) { /* right blank padding to the field width */ + int n = min_field_width - (str_arg_l+number_of_zeros_to_pad); + if (n > 0) { + if (str_l < str_m) { + size_t avail = str_m-str_l; + fast_memset(str+str_l, ' ', (n>avail?avail:n)); + } + str_l += n; + } + } + } + } +#if defined(NEED_SNPRINTF_ONLY) + va_end(ap); +#endif + if (str_m > 0) { /* make sure the string is null-terminated + even at the expense of overwriting the last character + (shouldn't happen, but just in case) */ + str[str_l <= str_m-1 ? str_l : str_m-1] = '\0'; + } + /* Return the number of characters formatted (excluding trailing null + * character), that is, the number of characters that would have been + * written to the buffer if it were large enough. 
+ * + * The value of str_l should be returned, but str_l is of unsigned type + * size_t, and snprintf is int, possibly leading to an undetected + * integer overflow, resulting in a negative return value, which is illegal. + * Both XSH5 and ISO C99 (at least the draft) are silent on this issue. + * Should errno be set to EOVERFLOW and EOF returned in this case??? + */ + return (int) str_l; +} +#endif Index: branches/apertium-tagger/apertium2/apertium/win32/snprintf.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/snprintf.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/snprintf.h (revision 69632) @@ -0,0 +1,26 @@ +#ifndef _PORTABLE_SNPRINTF_H_ +#define _PORTABLE_SNPRINTF_H_ + +#define PORTABLE_SNPRINTF_VERSION_MAJOR 2 +#define PORTABLE_SNPRINTF_VERSION_MINOR 2 + +#ifdef HAVE_SNPRINTF +#include +#else +extern int snprintf(char *, size_t, const char *, /*args*/ ...); +extern int vsnprintf(char *, size_t, const char *, va_list); +#endif + +#if defined(HAVE_SNPRINTF) && defined(PREFER_PORTABLE_SNPRINTF) +extern int portable_snprintf(char *str, size_t str_m, const char *fmt, /*args*/ ...); +extern int portable_vsnprintf(char *str, size_t str_m, const char *fmt, va_list ap); +#define snprintf portable_snprintf +#define vsnprintf portable_vsnprintf +#endif + +extern int asprintf (char **ptr, const char *fmt, /*args*/ ...); +extern int vasprintf (char **ptr, const char *fmt, va_list ap); +extern int asnprintf (char **ptr, size_t str_m, const char *fmt, /*args*/ ...); +extern int vasnprintf(char **ptr, size_t str_m, const char *fmt, va_list ap); + +#endif Index: branches/apertium-tagger/apertium2/apertium/win32/libgen.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/libgen.c (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/libgen.c (revision 69632) @@ -0,0 +1,25 @@ +#include + +#include
"libgen.h" + +// http://www.opengroup.org/onlinepubs/007908775/xsh/basename.html +// Returns a pointer into path just past its last '\\'; a pointer to that '\\' when it is the final character; path itself when it holds no '\\'; and "." when path is NULL. Only '\\' is treated as a separator. +char* basename(char *path) { + if (path != NULL) { + // Find the last position of the \ in the path name + char* pos = strrchr(path, '\\'); + + if (pos != NULL) { // If a \ char was found... + if (*(pos + 1) != '\0') // If it is not the last character in the string... + return pos + 1; // then return a pointer to the first character after \. + else + return pos; // else return a pointer to the trailing backslash + + } else { // If a \ char was NOT found + return path; // return the pointer passed to basename (this is probably non-conformant) + } + + } else { // If path == NULL, return "." + return "."; + } +} Index: branches/apertium-tagger/apertium2/apertium/win32/libgen.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/libgen.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/libgen.h (revision 69632) @@ -0,0 +1,14 @@ +#ifndef LIBGEN_H +#define LIBGEN_H + +#ifdef __cplusplus + extern "C" { +#endif + +char *basename(char *); + +#ifdef __cplusplus + } +#endif + +#endif Index: branches/apertium-tagger/apertium2/apertium/win32/regex.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/regex.c (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/regex.c (revision 69632) @@ -0,0 +1,4948 @@ +/* Extended regular expression matching and search library, + version 0.12. + (Implements POSIX draft P10003.2/D11.2, except for + internationalization features.) + + Copyright (C) 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* AIX requires this to be the first thing in the file. */ +#if defined (_AIX) && !defined (REGEX_MALLOC) + #pragma alloca +#endif + +#define _GNU_SOURCE + +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +#include + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs + +#include "lisp.h" +#include "buffer.h" +#include "syntax.h" + +/* Emacs uses `NULL' as a predicate. */ +#undef NULL + +#else /* not emacs */ + +/* We used to test for `BSTRING' here, but only GCC and Emacs define + `BSTRING', as far as I know, and neither of them use this code. */ +#if HAVE_STRING_H || STDC_HEADERS +#include +#ifndef bcmp +#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) +#endif +#ifndef bcopy +#define bcopy(s, d, n) memcpy ((d), (s), (n)) +#endif +#ifndef bzero +#define bzero(s, n) memset ((s), 0, (n)) +#endif +#else +#include +#endif + +#ifdef STDC_HEADERS +#include +#else +char *malloc (); +char *realloc (); +#endif + + +/* Define the syntax stuff for \<, \>, etc. */ + +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +#ifndef Sword +#define Sword 1 +#endif + +#ifdef SYNTAX_TABLE + +extern char *re_syntax_table; + +#else /* not SYNTAX_TABLE */ + +/* How many characters in the character set. 
*/ +#define CHAR_SET_SIZE 256 + +static char re_syntax_table[CHAR_SET_SIZE]; + +static void +init_syntax_once () +{ + register int c; + static int done = 0; + + if (done) + return; + + bzero (re_syntax_table, sizeof re_syntax_table); + + for (c = 'a'; c <= 'z'; c++) + re_syntax_table[c] = Sword; + + for (c = 'A'; c <= 'Z'; c++) + re_syntax_table[c] = Sword; + + for (c = '0'; c <= '9'; c++) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; +} + +#endif /* not SYNTAX_TABLE */ + +#define SYNTAX(c) re_syntax_table[c] + +#endif /* not emacs */ + +/* Get the interface, including the syntax bits. */ +#include "regex.h" + +/* isalpha etc. are used for the character classes. */ +#include + +#ifndef isascii +#define isascii(c) 1 +#endif + +#ifdef isblank +#define ISBLANK(c) (isascii (c) && isblank (c)) +#else +#define ISBLANK(c) ((c) == ' ' || (c) == '\t') +#endif +#ifdef isgraph +#define ISGRAPH(c) (isascii (c) && isgraph (c)) +#else +#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) +#endif + +#define ISPRINT(c) (isascii (c) && isprint (c)) +#define ISDIGIT(c) (isascii (c) && isdigit (c)) +#define ISALNUM(c) (isascii (c) && isalnum (c)) +#define ISALPHA(c) (isascii (c) && isalpha (c)) +#define ISCNTRL(c) (isascii (c) && iscntrl (c)) +#define ISLOWER(c) (isascii (c) && islower (c)) +#define ISPUNCT(c) (isascii (c) && ispunct (c)) +#define ISSPACE(c) (isascii (c) && isspace (c)) +#define ISUPPER(c) (isascii (c) && isupper (c)) +#define ISXDIGIT(c) (isascii (c) && isxdigit (c)) + +#ifndef NULL +#define NULL 0 +#endif + +/* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ +#undef SIGN_EXTEND_CHAR +#if __STDC__ +#define SIGN_EXTEND_CHAR(c) ((signed char) (c)) +#else /* not __STDC__ */ +/* As in Harbison and Steele. 
*/ +#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + +#ifdef REGEX_MALLOC + +#define REGEX_ALLOCATE malloc +#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +#ifndef alloca + +/* Make alloca work the best possible way. */ +#ifdef __GNUC__ +#define alloca __builtin_alloca +#else /* not __GNUC__ */ +#if HAVE_ALLOCA_H +#include +#else /* not __GNUC__ or HAVE_ALLOCA_H */ +#ifndef _AIX /* Already did AIX, up at the top. */ +char *alloca (); +#endif /* not _AIX */ +#endif /* not HAVE_ALLOCA_H */ +#endif /* not __GNUC__ */ + +#endif /* not alloca */ + +#define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +#define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + bcopy (source, destination, osize), \ + destination) + +#endif /* not REGEX_MALLOC */ + + +/* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. 
*/ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +typedef char boolean; +#define false 0 +#define true 1 + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. + + The value of `exactn' is needed in search.c (search_buffer) in Emacs. + So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of + `exactn' we use here must also be 1. */ + +typedef enum +{ + no_op = 0, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn = 1, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) 
*/ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. 
A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ + +#ifdef emacs + ,before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. 
/* Function form of EXTRACT_NUMBER, used to debug the macro.

   Reads the two bytes at SOURCE as a little-endian signed 16-bit quantity
   (low byte first, high byte sign-extended) and stores the result in *DEST.  */
static void
extract_number (int *dest, const unsigned char *source)
{
  /* Sign-extend the high byte: 0..127 stay as they are, 128..255 become
     -128..-1.  This is the same value SIGN_EXTEND_CHAR produces.  */
  int high = (source[1] ^ 128) - 128;

  /* Multiply rather than `high << 8`: left-shifting a negative int is
     undefined behaviour, while the product is always well defined here.  */
  *dest = (source[0] & 0377) + high * 256;
}

/* Function form of EXTRACT_NUMBER_AND_INCR: same as extract_number, then
   advances *SOURCE past the two bytes that were read.  */
static void
extract_number_and_incr (int *destination, unsigned char **source)
{
  extract_number (destination, *source);
  *source += 2;
}
*/ +#include + +static int debug = 0; + +#define DEBUG_STATEMENT(e) e +#define DEBUG_PRINT1(x) if (debug) printf (x) +#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +extern void printchar (); + +/* Print the fastmap in human-readable form. */ + +void +print_fastmap (fastmap) + char *fastmap; +{ + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) + { + if (fastmap[i++]) + { + was_a_range = 0; + printchar (i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) + { + was_a_range = 1; + i++; + } + if (was_a_range) + { + printf ("-"); + printchar (i - 1); + } + } + } + putchar ('\n'); +} + + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void +print_partial_compiled_pattern (start, end) + unsigned char *start; + unsigned char *end; +{ + int mcnt, mcnt2; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) + { + printf ("(null)\n"); + return; + } + + /* Loop over pattern commands. 
*/ + while (p < pend) + { + switch ((re_opcode_t) *p++) + { + case no_op: + printf ("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf ("/exactn/%d", mcnt); + do + { + putchar ('/'); + printchar (*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf ("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf ("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf ("/duplicate/%d", *p++); + break; + + case anychar: + printf ("/anychar"); + break; + + case charset: + case charset_not: + { + register int c; + + printf ("/charset%s", + (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); + + assert (p + *p < pend); + + for (c = 0; c < *p; c++) + { + unsigned bit; + unsigned char map_byte = p[1 + c]; + + putchar ('/'); + + for (bit = 0; bit < BYTEWIDTH; bit++) + if (map_byte & (1 << bit)) + printchar (c * BYTEWIDTH + bit); + } + p += 1 + *p; + break; + } + + case begline: + printf ("/begline"); + break; + + case endline: + printf ("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump/0/%d", mcnt); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_keep_string_jump/0/%d", mcnt); + break; + + case dummy_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/dummy_failure_jump/0/%d", mcnt); + break; + + case push_dummy_failure: + printf ("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/maybe_pop_jump/0/%d", mcnt); + break; + + case pop_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/pop_failure_jump/0/%d", mcnt); + break; + + case jump_past_alt: + extract_number_and_incr (&mcnt, &p); + printf ("/jump_past_alt/0/%d", mcnt); + break; + + case jump: + extract_number_and_incr (&mcnt, &p); + printf ("/jump/0/%d", mcnt); + break; + + case succeed_n: + extract_number_and_incr (&mcnt, 
&p); + extract_number_and_incr (&mcnt2, &p); + printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case jump_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case set_number_at: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); + break; + + case wordbound: + printf ("/wordbound"); + break; + + case notwordbound: + printf ("/notwordbound"); + break; + + case wordbeg: + printf ("/wordbeg"); + break; + + case wordend: + printf ("/wordend"); + +#ifdef emacs + case before_dot: + printf ("/before_dot"); + break; + + case at_dot: + printf ("/at_dot"); + break; + + case after_dot: + printf ("/after_dot"); + break; + + case syntaxspec: + printf ("/syntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + + case notsyntaxspec: + printf ("/notsyntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; +#endif /* emacs */ + + case wordchar: + printf ("/wordchar"); + break; + + case notwordchar: + printf ("/notwordchar"); + break; + + case begbuf: + printf ("/begbuf"); + break; + + case endbuf: + printf ("/endbuf"); + break; + + default: + printf ("?%d", *(p-1)); + } + } + printf ("/\n"); +} + + +void +print_compiled_pattern (bufp) + struct re_pattern_buffer *bufp; +{ + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern (buffer, buffer + bufp->used); + printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) + { + printf ("fastmap: "); + print_fastmap (bufp->fastmap); + } + + printf ("re_nsub: %d\t", bufp->re_nsub); + printf ("regs_alloc: %d\t", bufp->regs_allocated); + printf ("can_be_null: %d\t", bufp->can_be_null); + printf ("newline_anchor: %d\n", bufp->newline_anchor); + printf ("no_sub: %d\t", bufp->no_sub); + printf ("not_bol: %d\t", bufp->not_bol); + printf ("not_eol: %d\t", bufp->not_eol); + 
printf ("syntax: %d\n", bufp->syntax); + /* Perhaps we should print the translate table? */ +} + + +void +print_double_string (where, string1, size1, string2, size2) + const char *where; + const char *string1; + const char *string2; + int size1; + int size2; +{ + unsigned this_char; + + if (where == NULL) + printf ("(null)"); + else + { + if (FIRST_STRING_P (where)) + { + for (this_char = where - string1; this_char < size1; this_char++) + printchar (string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + printchar (string2[this_char]); + } +} + +#else /* not DEBUG */ + +#undef assert +#define assert(e) + +#define DEBUG_STATEMENT(e) +#define DEBUG_PRINT1(x) +#define DEBUG_PRINT2(x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t +re_set_syntax (syntax) + reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; + return ret; +} + +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. 
*/ + +static const char *re_error_msg[] = + { NULL, /* REG_NOERROR */ + "No match", /* REG_NOMATCH */ + "Invalid regular expression", /* REG_BADPAT */ + "Invalid collation character", /* REG_ECOLLATE */ + "Invalid character class name", /* REG_ECTYPE */ + "Trailing backslash", /* REG_EESCAPE */ + "Invalid back reference", /* REG_ESUBREG */ + "Unmatched [ or [^", /* REG_EBRACK */ + "Unmatched ( or \\(", /* REG_EPAREN */ + "Unmatched \\{", /* REG_EBRACE */ + "Invalid content of \\{\\}", /* REG_BADBR */ + "Invalid range end", /* REG_ERANGE */ + "Memory exhausted", /* REG_ESPACE */ + "Invalid preceding regular expression", /* REG_BADRPT */ + "Premature end of regular expression", /* REG_EEND */ + "Regular expression too big", /* REG_ESIZE */ + "Unmatched ) or \\)", /* REG_ERPAREN */ + }; + +/* Subroutine declarations and macros for regex_compile. */ + +static void store_op1 (), store_op2 (); +static void insert_op1 (), insert_op2 (); +static boolean at_begline_loc_p (), at_endline_loc_p (); +static boolean group_in_compile_stack (); +static reg_errcode_t compile_range (); + +/* Fetch the next character in the uncompiled pattern---translating it + if necessary. Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ +#define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = translate[c]; \ + } while (0) + +/* Fetch the next character in the uncompiled pattern, with no + translation. */ +#define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + +/* Go backwards one character in the pattern. */ +#define PATUNFETCH p-- + + +/* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. 
But + when we use a character as a subscript we must make it unsigned. */ +#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) + + +/* Macros for outputting the compiled pattern into `buffer'. */ + +/* If the buffer isn't allocated when it comes in, use this. */ +#define INIT_BUF_SIZE 32 + +/* Make sure we have at least N more bytes of space in buffer. */ +#define GET_BUFFER_SPACE(n) \ + while (b - bufp->buffer + (n) > bufp->allocated) \ + EXTEND_BUFFER () + +/* Make sure we have one more byte of buffer space and then add C to it. */ +#define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + +/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ +#define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + +/* As with BUF_PUSH_2, except for three bytes. */ +#define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + +/* Store a jump with opcode OP at LOC to location TO. We store a + relative address offset by the three bytes the jump itself occupies. */ +#define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (to) - (loc) - 3) + +/* Likewise, for a two-argument jump. */ +#define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (to) - (loc) - 3, arg) + +/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (to) - (loc) - 3, b) + +/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (to) - (loc) - 3, arg, b) + + +/* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. 
So if 2^16 bytes turns out to + be too small, many things would have to change. */ +#define MAX_BUF_SIZE (1L << 16) + + +/* Extend the buffer by twice its current size via realloc and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If extending the buffer results in it + being larger than MAX_BUF_SIZE, then flag memory exhausted. */ +#define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + +/* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. */ +#define MAX_REGNUM 255 + +/* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ +typedef unsigned regnum_t; + + +/* Macros for the compile stack. */ + +/* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. 
*/ +typedef int pattern_offset_t; + +typedef struct +{ + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; +} compile_stack_elt_t; + + +typedef struct +{ + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} compile_stack_type; + + +#define INIT_COMPILE_STACK_SIZE 32 + +#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) +#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + +/* The next available element. */ +#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + +/* Set the bit for character C in a list. */ +#define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) c) % BYTEWIDTH)) + + +/* Get the next unsigned number in the uncompiled pattern. */ +#define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while (ISDIGIT (c)) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ + } + +#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ + +#define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. 
+ + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +static reg_errcode_t +regex_compile (pattern, size, syntax, bufp) + const char *pattern; + int size; + reg_syntax_t syntax; + struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random tempory spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + char *translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. 
*/ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1 ("\nCompiling pattern: "); + if (debug) + { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + printchar (pattern[debug_count]); + putchar ('\n'); + } +#endif /* DEBUG */ + + /* Initialize the compile stack. */ + compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined (emacs) && !defined (SYNTAX_TABLE) + /* Initialize the syntax table. */ + init_syntax_once (); +#endif + + if (bufp->allocated == 0) + { + if (bufp->buffer) + { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); + } + else + { /* Caller did not allocate a buffer. Do it for them. 
*/ + bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) return REG_ESPACE; + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) + { + PATFETCH (c); + + switch (c) + { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p (pattern, p, syntax)) + BUF_PUSH (begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p (p, pend, syntax)) + BUF_PUSH (endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. 
*/ + + for (;;) + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH (c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) + ; + + else if (syntax & RE_BK_PLUS_QM && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + if (!(c1 == '+' || c1 == '?')) + { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } + else + { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) + { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. This way we only + push a failure point once, instead of every time + through the loop. */ + assert (p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE (3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + && zero_times_ok + && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && !(syntax & RE_DOT_NEWLINE)) + { /* We have .*\n. */ + STORE_JUMP (jump, b, laststart); + keep_string_p = true; + } + else + /* Anything else. */ + STORE_JUMP (maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. 
*/ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, + laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) + { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH (anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) return REG_EBRACK; + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE (34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + SET_LIST_BIT ('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) return REG_EBRACK; + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + SET_LIST_BIT (c1); + continue; + } + + /* Could be the end of the bracket expression. 
If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + return REG_ERANGE; + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret + = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + + ret = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + /* See if we're at the beginning of a possible character + class. */ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) return REG_EBRACK; + + for (;;) + { + PATFETCH (c); + if (c == ':' || c == ']' || p == pend + || c1 == CHAR_CLASS_MAX_LENGTH) + break; + str[c1++] = c; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and:`]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). 
*/ + if (c == ':' && *p == ']') + { + int ch; + boolean is_alnum = STREQ (str, "alnum"); + boolean is_alpha = STREQ (str, "alpha"); + boolean is_blank = STREQ (str, "blank"); + boolean is_cntrl = STREQ (str, "cntrl"); + boolean is_digit = STREQ (str, "digit"); + boolean is_graph = STREQ (str, "graph"); + boolean is_lower = STREQ (str, "lower"); + boolean is_print = STREQ (str, "print"); + boolean is_punct = STREQ (str, "punct"); + boolean is_space = STREQ (str, "space"); + boolean is_upper = STREQ (str, "upper"); + boolean is_xdigit = STREQ (str, "xdigit"); + + if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) return REG_EBRACK; + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + { + if ( (is_alnum && ISALNUM (ch)) + || (is_alpha && ISALPHA (ch)) + || (is_blank && ISBLANK (ch)) + || (is_cntrl && ISCNTRL (ch)) + || (is_digit && ISDIGIT (ch)) + || (is_graph && ISGRAPH (ch)) + || (is_lower && ISLOWER (ch)) + || (is_print && ISPRINT (ch)) + || (is_punct && ISPUNCT (ch)) + || (is_space && ISSPACE (ch)) + || (is_upper && ISUPPER (ch)) + || (is_xdigit && ISXDIGIT (ch))) + SET_LIST_BIT (ch); + } + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT (':'); + had_char_class = false; + } + } + else + { + had_char_class = false; + SET_LIST_BIT (c); + } + } + + /* Discard any (non)matching list bytes that are all 0 at the + end of the map. Decrease the map-length byte too. 
*/ + while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) + b[-1]--; + b += b[-1]; + } + break; + + + case '(': + if (syntax & RE_NO_BK_PARENS) + goto handle_open; + else + goto normal_char; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto handle_close; + else + goto normal_char; + + + case '\n': + if (syntax & RE_NEWLINE_ALT) + goto handle_alt; + else + goto normal_char; + + + case '|': + if (syntax & RE_NO_BK_VBAR) + goto handle_alt; + else + goto normal_char; + + + case '{': + if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) + goto handle_interval; + else + goto normal_char; + + + case '\\': + if (p == pend) return REG_EESCAPE; + + /* Do not translate the character after the \, so that we can + distinguish, e.g., \B from \b, even if we normally would + translate, e.g., B to b. */ + PATFETCH_RAW (c); + + switch (c) + { + case '(': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + handle_open: + bufp->re_nsub++; + regnum++; + + if (COMPILE_STACK_FULL) + { + RETALLOC (compile_stack.stack, compile_stack.size << 1, + compile_stack_elt_t); + if (compile_stack.stack == NULL) return REG_ESPACE; + + compile_stack.size <<= 1; + } + + /* These are the values to restore when we hit end of this + group. They are all relative offsets, so that if the + whole pattern moves because of realloc, they will still + be valid. */ + COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.fixup_alt_jump + = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + COMPILE_STACK_TOP.regnum = regnum; + + /* We will eventually replace the 0 with the number of + groups inner to this one. But do not push a + start_memory for groups beyond the last one we can + represent in the compiled pattern. 
*/ + if (regnum <= MAX_REGNUM) + { + COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; + BUF_PUSH_3 (start_memory, regnum, 0); + } + + compile_stack.avail++; + + fixup_alt_jump = 0; + laststart = 0; + begalt = b; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + break; + + + case ')': + if (syntax & RE_NO_BK_PARENS) goto normal_backslash; + + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_backslash; + else + return REG_ERPAREN; + + handle_close: + if (fixup_alt_jump) + { /* Push a dummy failure point at the end of the + alternative for a possible future + `pop_failure_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUF_PUSH (push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. */ + STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); + } + + /* See similar code for backslashed left paren above. */ + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + else + return REG_ERPAREN; + + /* Since we just checked for an empty stack above, this + ``can't happen''. */ + assert (compile_stack.avail != 0); + { + /* We don't just want to restore into `regnum', because + later groups should continue to be numbered higher, + as in `(ab)c(de)' -- the second group is #2. */ + regnum_t this_group_regnum; + + compile_stack.avail--; + begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + fixup_alt_jump + = COMPILE_STACK_TOP.fixup_alt_jump + ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 + : 0; + laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + this_group_regnum = COMPILE_STACK_TOP.regnum; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. 
*/ + pending_exact = 0; + + /* We're at the end of the group, so now we know how many + groups were inside this one. */ + if (this_group_regnum <= MAX_REGNUM) + { + unsigned char *inner_group_loc + = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; + + *inner_group_loc = regnum - this_group_regnum; + BUF_PUSH_3 (stop_memory, this_group_regnum, + regnum - this_group_regnum); + } + } + break; + + + case '|': /* `\|'. */ + if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) + goto normal_backslash; + handle_alt: + if (syntax & RE_LIMITED_OPS) + goto normal_char; + + /* Insert before the previous alternative a jump which + jumps to this alternative if the former fails. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (on_failure_jump, begalt, b + 6); + pending_exact = 0; + b += 3; + + /* The alternative before this one has a jump after it + which gets executed if it gets matched. Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b', then fixup_alt_jump right now points to a + three-byte space after `a'. We'll put in the jump, set + fixup_alt_jump to right after `b', and leave behind three + bytes which we'll fill in when we get to after `c'. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + /* Mark and leave space for a jump after this alternative, + to be filled in later either by next alternative or + when know we're at the end of a series of alternatives. */ + fixup_alt_jump = b; + GET_BUFFER_SPACE (3); + b += 3; + + laststart = 0; + begalt = b; + break; + + + case '{': + /* If \{ is a literal. */ + if (!(syntax & RE_INTERVALS) + /* If we're at `\{' and it's not the open-interval + operator. 
*/ + || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) + || (p - 2 == pattern && p == pend)) + goto normal_backslash; + + handle_interval: + { + /* If got here, then the syntax allows intervals. */ + + /* At least (most) this many matches must be made. */ + int lower_bound = -1, upper_bound = -1; + + beg_interval = p - 1; + + if (p == pend) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_EBRACE; + } + + GET_UNSIGNED_NUMBER (lower_bound); + + if (c == ',') + { + GET_UNSIGNED_NUMBER (upper_bound); + if (upper_bound < 0) upper_bound = RE_DUP_MAX; + } + else + /* Interval such as `{1}' => match exactly once. */ + upper_bound = lower_bound; + + if (lower_bound < 0 || upper_bound > RE_DUP_MAX + || lower_bound > upper_bound) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (c != '\\') return REG_EBRACE; + + PATFETCH (c); + } + + if (c != '}') + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + /* We just parsed a valid interval. */ + + /* If it's invalid to have no preceding re. */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (syntax & RE_CONTEXT_INDEP_OPS) + laststart = b; + else + goto unfetch_interval; + } + + /* If the upper bound is zero, don't want to succeed at + all; jump from `laststart' to `b + 3', which will be + the end of the buffer after we insert the jump. */ + if (upper_bound == 0) + { + GET_BUFFER_SPACE (3); + INSERT_JUMP (jump, laststart, b + 3); + b += 3; + } + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at + set_number_at + succeed_n + + jump_n + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else + { /* If the upper bound is > 1, we need to insert + more at the end of the loop. 
*/ + unsigned nbytes = 10 + (upper_bound > 1) * 10; + + GET_BUFFER_SPACE (nbytes); + + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2 (succeed_n, laststart, + b + 5 + (upper_bound > 1) * 5, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 5, lower_bound, b); + b += 5; + + if (upper_bound > 1) + { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2 (jump_n, b, laststart + 5, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2 (set_number_at, laststart, b - laststart, + upper_bound - 1, b); + b += 5; + } + } + pending_exact = 0; + beg_interval = NULL; + } + break; + + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + assert (beg_interval); + p = beg_interval; + beg_interval = NULL; + + /* normal_char and normal_backslash need `c'. 
*/ + PATFETCH (c); + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (p > pattern && p[-1] == '\\') + goto normal_backslash; + } + goto normal_char; + +#ifdef emacs + /* There is no way to specify the before_dot and after_dot + operators. rms says this is ok. --karl */ + case '=': + BUF_PUSH (at_dot); + break; + + case 's': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); + break; + + case 'S': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); + break; +#endif /* emacs */ + + + case 'w': + laststart = b; + BUF_PUSH (wordchar); + break; + + + case 'W': + laststart = b; + BUF_PUSH (notwordchar); + break; + + + case '<': + BUF_PUSH (wordbeg); + break; + + case '>': + BUF_PUSH (wordend); + break; + + case 'b': + BUF_PUSH (wordbound); + break; + + case 'B': + BUF_PUSH (notwordbound); + break; + + case '`': + BUF_PUSH (begbuf); + break; + + case '\'': + BUF_PUSH (endbuf); + break; + + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + if (syntax & RE_NO_BK_REFS) + goto normal_char; + + c1 = c - '0'; + + if (c1 > regnum) + return REG_ESUBREG; + + /* Can't back reference to a subexpression if inside of it. */ + if (group_in_compile_stack (compile_stack, c1)) + goto normal_char; + + laststart = b; + BUF_PUSH_2 (duplicate, c1); + break; + + + case '+': + case '?': + if (syntax & RE_BK_PLUS_QM) + goto handle_plus; + else + goto normal_backslash; + + default: + normal_backslash: + /* You might think it would be useful for \ to mean + not to translate; but if we don't translate it + it will never match anything. */ + c = TRANSLATE (c); + goto normal_char; + } + break; + + + default: + /* Expects the character in `c'. */ + normal_char: + /* If no exactn currently being built. */ + if (!pending_exact + + /* If last exactn not at current position. */ + || pending_exact + *pending_exact + 1 != b + + /* We have only one byte following the exactn for the count. 
*/ + || *pending_exact == (1 << BYTEWIDTH) - 1 + + /* If followed by a repetition operator. */ + || *p == '*' || *p == '^' + || ((syntax & RE_BK_PLUS_QM) + ? *p == '\\' && (p[1] == '+' || p[1] == '?') + : (*p == '+' || *p == '?')) + || ((syntax & RE_INTERVALS) + && ((syntax & RE_NO_BK_BRACES) + ? *p == '{' + : (p[0] == '\\' && p[1] == '{')))) + { + /* Start building a new exactn. */ + + laststart = b; + + BUF_PUSH_2 (exactn, 0); + pending_exact = b - 1; + } + + BUF_PUSH (c); + (*pending_exact)++; + break; + } /* switch (c) */ + } /* while p != pend */ + + + /* Through the pattern now. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + if (!COMPILE_STACK_EMPTY) + return REG_EPAREN; + + free (compile_stack.stack); + + /* We have succeeded; set the length of the buffer. */ + bufp->used = b - bufp->buffer; + +#ifdef DEBUG + if (debug) + { + DEBUG_PRINT1 ("\nCompiled pattern: "); + print_compiled_pattern (bufp); + } +#endif /* DEBUG */ + + return REG_NOERROR; +} /* regex_compile */ + +/* Subroutines for `regex_compile'. */ + +/* Store OP at LOC followed by two-byte integer parameter ARG. */ + +static void +store_op1 (op, loc, arg) + re_opcode_t op; + unsigned char *loc; + int arg; +{ + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg); +} + + +/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ + +static void +store_op2 (op, loc, arg1, arg2) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; +{ + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg1); + STORE_NUMBER (loc + 3, arg2); +} + + +/* Copy the bytes from LOC to END to open up three bytes of space at LOC + for OP followed by two-byte integer parameter ARG. 
*/ + +static void +insert_op1 (op, loc, arg, end) + re_opcode_t op; + unsigned char *loc; + int arg; + unsigned char *end; +{ + register unsigned char *pfrom = end; + register unsigned char *pto = end + 3; + + /* Move the bytes in [LOC, END) up by three. Copy from the high end downwards, because source and destination overlap and the destination is the higher of the two. Nothing here checks that three bytes are free beyond END; that is left to the caller. */ while (pfrom != loc) + *--pto = *--pfrom; + + store_op1 (op, loc, arg); +} + + +/* Like `insert_op1', but opens up five bytes of space at LOC, for OP followed by two two-byte integer parameters ARG1 and ARG2. The bytes from LOC to END are moved up by five, again with no check that the space beyond END is available. */ + +static void +insert_op2 (op, loc, arg1, arg2, end) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; + unsigned char *end; +{ + register unsigned char *pfrom = end; + register unsigned char *pto = end + 5; + + while (pfrom != loc) + *--pto = *--pfrom; + + store_op2 (op, loc, arg1, arg2); +} + + +/* P points to just after a ^ in PATTERN. Return true if that ^ comes + after an alternative or a begin-subexpression. We assume there is at + least one character before the ^. */ + +static boolean +at_begline_loc_p (pattern, p, syntax) + const char *pattern, *p; + reg_syntax_t syntax; +{ + const char *prev = p - 2; /* The character just before the ^. */ + boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; /* True if PREV is itself preceded by a backslash. */ + + return + /* After the start of a subexpression, i.e. an open paren? */ + (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) + /* After an alternative? */ + || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); +} + + +/* The dual of at_begline_loc_p. This one is for $: P points to just after a $, and we return true if that $ comes before the end of a subexpression or before an alternative. We assume there is + at least one character after the $, i.e., `P < PEND'. NOTE(review): SYNTAX is declared `int' here but `reg_syntax_t' in at_begline_loc_p -- confirm that the difference is intended. */ + +static boolean +at_endline_loc_p (p, pend, syntax) + const char *p, *pend; + int syntax; +{ + const char *next = p; + boolean next_backslash = *next == '\\'; + const char *next_next = p + 1 < pend ? p + 1 : NULL; /* NULL when only one character remains before PEND. */ + + return + /* Before the end of a subexpression, i.e. a close paren? */ + (syntax & RE_NO_BK_PARENS ? *next == ')' + : next_backslash && next_next && *next_next == ')') + /* Before an alternative? */ + || (syntax & RE_NO_BK_VBAR ? *next == '|' + : next_backslash && next_next && *next_next == '|'); +} + + +/* Returns true if REGNUM is in one of COMPILE_STACK's elements and + false if it's not. 
*/ + +static boolean +group_in_compile_stack (compile_stack, regnum) + compile_stack_type compile_stack; + regnum_t regnum; +{ + int this_element; + + /* Walk the entries from index `avail - 1' down to 0. */ for (this_element = compile_stack.avail - 1; + this_element >= 0; + this_element--) + if (compile_stack.stack[this_element].regnum == regnum) + return true; + + return false; +} + + +/* Read the ending character of a range (in a bracket expression) from the + uncompiled pattern *P_PTR (which ends at PEND). We assume the + starting character is in `P[-2]'. (`P[-1]' is the character `-'.) + Then we set the translation of all bits between the starting and + ending characters (inclusive) in the compiled pattern B. + + Return an error code: REG_ERANGE if the pattern ends before the ending character, or if the range is empty (start after end) and SYNTAX has RE_NO_EMPTY_RANGES set; REG_NOERROR otherwise. Unless the pattern ended first, *P_PTR is left one character further on. + + We use these short variable names so we can use the same macros as + `regex_compile' itself. */ + +static reg_errcode_t +compile_range (p_ptr, pend, translate, syntax, b) + const char **p_ptr, *pend; + char *translate; + reg_syntax_t syntax; + unsigned char *b; +{ + unsigned this_char; + + const char *p = *p_ptr; + int range_start, range_end; + + if (p == pend) + return REG_ERANGE; + + /* Even though the pattern is a signed `char *', we need to fetch + with unsigned char *'s; if the high bit of the pattern character + is set, the range endpoints will be negative if we fetch using a + signed char *. + + We also want to fetch the endpoints without translating them; the + appropriate translation is done in the bit-setting loop below. */ + range_start = ((unsigned char *) p)[-2]; + range_end = ((unsigned char *) p)[0]; + + /* Have to increment the pointer into the pattern string, so the + caller isn't still at the ending character. */ + (*p_ptr)++; + + /* If the start is after the end, the range is empty. */ + if (range_start > range_end) + return syntax & RE_NO_EMPTY_RANGES ? 
REG_ERANGE : REG_NOERROR; + + /* Here we see why `this_char' has to be larger than an `unsigned + char' -- the range is inclusive, so if `range_end' == 0xff + (assuming 8-bit characters), we would otherwise go into an infinite + loop, since all characters <= 0xff. */ + for (this_char = range_start; this_char <= range_end; this_char++) + { + SET_LIST_BIT (TRANSLATE (this_char)); + } + + return REG_NOERROR; +} + +/* Failure stack declarations and macros; both re_compile_fastmap and + re_match_2 use a failure stack. These have to be macros because of + REGEX_ALLOCATE. */ + + +/* Number of failure points for which to initially allocate space + when matching. If this number is exceeded, we allocate more + space, so it is not a hard limit. */ +#ifndef INIT_FAILURE_ALLOC +#define INIT_FAILURE_ALLOC 5 +#endif + +/* Roughly the maximum number of failure points on the stack. Would be + exactly that if always used MAX_FAILURE_SPACE each time we failed. + This is a variable only so users of regex can assign to it; we never + change it ourselves. NOTE(review): no MAX_FAILURE_SPACE is visible nearby; DOUBLE_FAIL_STACK bounds the stack size by `re_max_failures * MAX_FAILURE_ITEMS', so MAX_FAILURE_ITEMS is presumably what is meant -- confirm. */ +int re_max_failures = 2000; + +typedef const unsigned char *fail_stack_elt_t; + +typedef struct +{ + fail_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} fail_stack_type; + +#define FAIL_STACK_EMPTY() (fail_stack.avail == 0) +#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) +#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) +#define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) /* NOTE(review): this indexes `avail', described above as the next open position rather than the last item pushed -- confirm the intended use. */ + + +/* Initialize `fail_stack'. Do `return -2' if the alloc fails. On success the stack is empty and has room for INIT_FAILURE_ALLOC items. */ + +#define INIT_FAIL_STACK() \ + do { \ + fail_stack.stack = (fail_stack_elt_t *) \ + REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + \ + if (fail_stack.stack == NULL) \ + return -2; \ + \ + fail_stack.size = INIT_FAILURE_ALLOC; \ + fail_stack.avail = 0; \ + } while (0) + + +/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 
+ + Return 1 if succeeds, and 0 if either ran out of memory + allocating space for it or it was already too large (its size exceeds `re_max_failures * MAX_FAILURE_ITEMS'). On an allocation failure the `stack' member has already been overwritten with NULL. + + REGEX_REALLOCATE requires `destination' be declared; MAX_FAILURE_ITEMS is written in terms of `num_regs', so that must be in scope too. */ + +#define DOUBLE_FAIL_STACK(fail_stack) \ + ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ + ? 0 \ + : ((fail_stack).stack = (fail_stack_elt_t *) \ + REGEX_REALLOCATE ((fail_stack).stack, \ + (fail_stack).size * sizeof (fail_stack_elt_t), \ + ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + \ + (fail_stack).stack == NULL \ + ? 0 \ + : ((fail_stack).size <<= 1, \ + 1))) + + +/* Push PATTERN_OP on FAIL_STACK. + + Return 1 if was able to do so and 0 if ran out of memory allocating + space to do so. The fullness test goes through FAIL_STACK_FULL (), which looks at the variable named `fail_stack' whatever the FAIL_STACK argument is. */ +#define PUSH_PATTERN_OP(pattern_op, fail_stack) \ + ((FAIL_STACK_FULL () \ + && !DOUBLE_FAIL_STACK (fail_stack)) \ + ? 0 \ + : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ + 1)) + +/* This pushes an item onto the failure stack. Must be a four-byte + value. Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. Does not check that a slot is free. NOTE(review): the item is cast to `fail_stack_elt_t', a pointer type, so "four-byte" presumably assumes 32-bit pointers -- confirm. */ +#define PUSH_FAILURE_ITEM(item) \ + fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item + +/* The complement operation. Assumes `fail_stack' is nonempty. Yields the item that was most recently pushed. */ +#define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] + +/* Used to omit pushing failure point id's when we're not debugging. Without DEBUG both macros expand to nothing. */ +#ifdef DEBUG +#define DEBUG_PUSH PUSH_FAILURE_ITEM +#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () +#else +#define DEBUG_PUSH(item) +#define DEBUG_POP(item_addr) +#endif + + +/* Push the information about the state we will need + if we ever fail back to it. + + Requires variables fail_stack, regstart, regend, reg_info, and + num_regs be declared. The macro body also reads `lowest_active_reg' and `highest_active_reg'. DOUBLE_FAIL_STACK requires `destination' be + declared. + + Does `return FAILURE_CODE' if runs out of memory. 
*/ + +#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ + do { \ + char *destination; \ + /* Must be int, so when we don't save any registers, the arithmetic \ + of 0 + -1 isn't done as unsigned. */ \ + int this_reg; \ + \ + DEBUG_STATEMENT (failure_id++); \ + DEBUG_STATEMENT (nfailure_points_pushed++); \ + DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ + DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ + DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ + \ + DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ + DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ + \ + /* Ensure we have enough space allocated for what we will push. */ \ + while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ + { \ + if (!DOUBLE_FAIL_STACK (fail_stack)) \ + return failure_code; \ + \ + DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ + (fail_stack).size); \ + DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ + } \ + \ + /* Push the info, starting with the registers. 
*/ \ + DEBUG_PRINT1 ("\n"); \ + \ + for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ + this_reg++) \ + { \ + DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ + DEBUG_STATEMENT (num_regs_pushed++); \ + \ + DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ + PUSH_FAILURE_ITEM (regstart[this_reg]); \ + \ + DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ + PUSH_FAILURE_ITEM (regend[this_reg]); \ + \ + DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ + DEBUG_PRINT2 (" match_null=%d", \ + REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ + DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ + DEBUG_PRINT2 (" matched_something=%d", \ + MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT2 (" ever_matched=%d", \ + EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT1 ("\n"); \ + PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ + } \ + \ + DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ + PUSH_FAILURE_ITEM (lowest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ + PUSH_FAILURE_ITEM (highest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ + PUSH_FAILURE_ITEM (pattern_place); \ + \ + DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ + DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ + size2); \ + DEBUG_PRINT1 ("'\n"); \ + PUSH_FAILURE_ITEM (string_place); \ + \ + DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ + DEBUG_PUSH (failure_id); \ + } while (0) + +/* This is the number of items that are pushed and popped on the stack + for each register: its `regstart' entry, its `regend' entry, and the `word' member of its `reg_info' entry. */ +#define NUM_REG_ITEMS 3 + +/* Individual items aside from the registers: `lowest_active_reg', `highest_active_reg', the pattern place and the string place, plus the failure point id when DEBUG is defined. */ +#ifdef DEBUG +#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ +#else +#define NUM_NONREG_ITEMS 4 +#endif + +/* We push at most this many items on the stack. 
*/ +#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) + +/* We actually push this many items. */ +#define NUM_FAILURE_ITEMS \ + ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ + + NUM_NONREG_ITEMS) + +/* How many items can still be added to the stack without overflowing it. */ +#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) + + +/* Pops what PUSH_FAIL_STACK pushes. + + We restore into the parameters, all of which should be lvalues: + STR -- the saved data position. + PAT -- the saved pattern position. + LOW_REG, HIGH_REG -- the highest and lowest active registers. + REGSTART, REGEND -- arrays of string positions. + REG_INFO -- array of information about each subexpression. + + Also assumes the variables `fail_stack' and (if debugging), `bufp', + `pend', `string1', `size1', `string2', and `size2'. */ + +#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ +{ \ + DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ + int this_reg; \ + const unsigned char *string_temp; \ + \ + assert (!FAIL_STACK_EMPTY ()); \ + \ + /* Remove failure points and point to how many regs pushed. */ \ + DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ + DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ + DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ + \ + assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ + \ + DEBUG_POP (&failure_id); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + \ + /* If the saved string location is NULL, it came from an \ + on_failure_keep_string_jump opcode, and we want to throw away the \ + saved NULL, thus retaining our current position in the string. 
*/ \ + string_temp = POP_FAILURE_ITEM (); \ + if (string_temp != NULL) \ + str = (const char *) string_temp; \ + \ + DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ + DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ + DEBUG_PRINT1 ("'\n"); \ + \ + pat = (unsigned char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ + \ + /* Restore register info. */ \ + high_reg = (unsigned) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ + \ + low_reg = (unsigned) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ + \ + for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ + { \ + DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ + \ + reg_info[this_reg].word = POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ + \ + regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ + \ + regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ + } \ + \ + DEBUG_STATEMENT (nfailure_points_popped++); \ +} /* POP_FAILURE_POINT */ + +/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in + BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible + characters can start a string that matches the pattern. This fastmap + is used by re_search to skip quickly over impossible starting points. + + The caller must supply the address of a (1 << BYTEWIDTH)-byte data + area as BUFP->fastmap. + + We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in + the pattern buffer. + + Returns 0 if we succeed, -2 if an internal error. */ + +int +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; +{ + int j, k; + fail_stack_type fail_stack; +#ifndef REGEX_MALLOC + char *destination; +#endif + /* We don't push any register information onto the failure stack. 
*/ + unsigned num_regs = 0; + + register char *fastmap = bufp->fastmap; + unsigned char *pattern = bufp->buffer; + unsigned long size = bufp->used; + const unsigned char *p = pattern; + register unsigned char *pend = pattern + size; + + /* Assume that each path through the pattern can be null until + proven otherwise. We set this false at the bottom of switch + statement, to which we get only if a particular path doesn't + match the empty string. */ + boolean path_can_be_null = true; + + /* We aren't doing a `succeed_n' to begin with. */ + boolean succeed_n_p = false; + + assert (fastmap != NULL && p != NULL); + + INIT_FAIL_STACK (); + bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ + bufp->fastmap_accurate = 1; /* It will be when we're done. */ + bufp->can_be_null = 0; + + while (p != pend || !FAIL_STACK_EMPTY ()) + { + if (p == pend) + { + bufp->can_be_null |= path_can_be_null; + + /* Reset for next path. */ + path_can_be_null = true; + + p = fail_stack.stack[--fail_stack.avail]; + } + + /* We should never be about to go beyond the end of the pattern. */ + assert (p < pend); + +#ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) +#else + switch ((re_opcode_t) *p++) +#endif + { + + /* I guess the idea here is to simply not bother with a fastmap + if a backreference is used, since it's too hard to figure out + the fastmap for the corresponding group. Setting + `can_be_null' stops `re_search_2' from using the fastmap, so + that is all we do. */ + case duplicate: + bufp->can_be_null = 1; + return 0; + + + /* Following are the cases which match a character. These end + with `break'. */ + + case exactn: + fastmap[p[1]] = 1; + break; + + + case charset: + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + break; + + + case charset_not: + /* Chars beyond end of map must be allowed. 
*/ + for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + break; + + + case wordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == Sword) + fastmap[j] = 1; + break; + + + case notwordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != Sword) + fastmap[j] = 1; + break; + + + case anychar: + /* `.' matches anything ... */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + /* ... except perhaps newline. */ + if (!(bufp->syntax & RE_DOT_NEWLINE)) + fastmap['\n'] = 0; + + /* Return if we have already set `can_be_null'; if we have, + then the fastmap is irrelevant. Something's wrong here. */ + else if (bufp->can_be_null) + return 0; + + /* Otherwise, have to check alternative paths. */ + break; + + +#ifdef emacs + case syntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + case notsyntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + /* All cases after this match the empty string. These end with + `continue'. */ + + + case before_dot: + case at_dot: + case after_dot: + continue; +#endif /* not emacs */ + + + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbound: + case notwordbound: + case wordbeg: + case wordend: + case push_dummy_failure: + continue; + + + case jump_n: + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case jump_past_alt: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + if (j > 0) + continue; + + /* Jump backward implies we just went through the body of a + loop and matched nothing. Opcode jumped to should be + `on_failure_jump' or `succeed_n'. Just treat it like an + ordinary jump. 
For a * loop, it has pushed its failure + point already; if so, discard that as redundant. */ + if ((re_opcode_t) *p != on_failure_jump + && (re_opcode_t) *p != succeed_n) + continue; + + p++; + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + + /* If what's on the stack is where we are now, pop it. */ + if (!FAIL_STACK_EMPTY () + && fail_stack.stack[fail_stack.avail - 1] == p) + fail_stack.avail--; + + continue; + + + case on_failure_jump: + case on_failure_keep_string_jump: + handle_on_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + + /* For some patterns, e.g., `(a?)?', `p+j' here points to the + end of the pattern. We don't want to push such a point, + since when we restore it above, entering the switch will + increment `p' past the end of the pattern. We don't need + to push such a point since we obviously won't find any more + fastmap entries beyond `pend'. Such a pattern can match + the null string, though. */ + if (p + j < pend) + { + if (!PUSH_PATTERN_OP (p + j, fail_stack)) + return -2; + } + else + bufp->can_be_null = 1; + + if (succeed_n_p) + { + EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ + succeed_n_p = false; + } + + continue; + + + case succeed_n: + /* Get to the number of times to succeed. */ + p += 2; + + /* Increment p past the n for when k != 0. */ + EXTRACT_NUMBER_AND_INCR (k, p); + if (k == 0) + { + p -= 4; + succeed_n_p = true; /* Spaghetti code alert. */ + goto handle_on_failure_jump; + } + continue; + + + case set_number_at: + p += 4; + continue; + + + case start_memory: + case stop_memory: + p += 2; + continue; + + + default: + abort (); /* We have listed all the cases. */ + } /* switch *p++ */ + + /* Getting here means we have found the possible starting + characters for one path of the pattern -- and that the empty + string does not match. We need not follow this path further. + Instead, look at the next alternative (remembered on the + stack), or quit if no more. The test at the top of the loop + does these things. 
*/ + path_can_be_null = false; + p = pend; + } /* while p */ + + /* Set `can_be_null' for the last path (also the first path, if the + pattern is empty). */ + bufp->can_be_null |= path_can_be_null; + return 0; +} /* re_compile_fastmap */ + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ + +void +re_set_registers (bufp, regs, num_regs, starts, ends) + struct re_pattern_buffer *bufp; + struct re_registers *regs; + unsigned num_regs; + regoff_t *starts, *ends; +{ + if (num_regs) + { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } + else + { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + regs->start = regs->end = (regoff_t) 0; + } +} + +/* Searching routines. */ + +/* Like re_search_2, below, but only one string is specified, and + doesn't let you say where to stop matching. */ + +int +re_search (bufp, string, size, startpos, range, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, startpos, range; + struct re_registers *regs; +{ + return re_search_2 (bufp, NULL, 0, string, size, startpos, range, + regs, size); +} + + +/* Using the compiled pattern in BUFP->buffer, first tries to match the + virtual concatenation of STRING1 and STRING2, starting first at index + STARTPOS, then at STARTPOS + 1, and so on. + + STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. + + RANGE is how far to scan while trying to match. 
RANGE = 0 means try + only at STARTPOS; in general, the last start tried is STARTPOS + + RANGE. + + In REGS, return the indices of the virtual concatenation of STRING1 + and STRING2 that matched the entire BUFP->buffer and its contained + subexpressions. + + Do not consider matching one past the index STOP in the virtual + concatenation of STRING1 and STRING2. + + We return either the position in the strings at which the match was + found, -1 if no match, or -2 if error (such as failure + stack overflow). */ + +int +re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int startpos; + int range; + struct re_registers *regs; + int stop; +{ + int val; + register char *fastmap = bufp->fastmap; + register char *translate = bufp->translate; + int total_size = size1 + size2; + int endpos = startpos + range; + + /* Check for out-of-range STARTPOS. */ + if (startpos < 0 || startpos > total_size) + return -1; + + /* Fix up RANGE if it might eventually take us outside + the virtual concatenation of STRING1 and STRING2. */ + if (endpos < -1) + range = -1 - startpos; + else if (endpos > total_size) + range = total_size - startpos; + + /* If the search isn't to be a backwards one, don't waste time in a + search for a pattern that must be anchored. */ + if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) + { + if (startpos > 0) + return -1; + else + range = 1; + } + + /* Update the fastmap now if not correct already. */ + if (fastmap && !bufp->fastmap_accurate) + if (re_compile_fastmap (bufp) == -2) + return -2; + + /* Loop through the string, looking for a place to start matching. */ + for (;;) + { + /* If a fastmap is supplied, skip quickly over characters that + cannot be the start of a match. If the pattern can match the + null string, however, we don't need to skip characters; we want + the first null string. 
*/ + if (fastmap && startpos < total_size && !bufp->can_be_null) + { + if (range > 0) /* Searching forwards. */ + { + register const char *d; + register int lim = 0; + int irange = range; + + if (startpos < size1 && startpos + range >= size1) + lim = range - (size1 - startpos); + + d = (startpos >= size1 ? string2 - size1 : string1) + startpos; + + /* Written out as an if-else to avoid testing `translate' + inside the loop. */ + if (translate) + while (range > lim + && !fastmap[(unsigned char) + translate[(unsigned char) *d++]]) + range--; + else + while (range > lim && !fastmap[(unsigned char) *d++]) + range--; + + startpos += irange - range; + } + else /* Searching backwards. */ + { + register char c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); + + if (!fastmap[(unsigned char) TRANSLATE (c)]) + goto advance; + } + } + + /* If can't match the null string, and that's all we have left, fail. */ + if (range >= 0 && startpos == total_size && fastmap + && !bufp->can_be_null) + return -1; + + val = re_match_2 (bufp, string1, size1, string2, size2, + startpos, regs, stop); + if (val >= 0) + return startpos; + + if (val == -2) + return -2; + + advance: + if (!range) + break; + else if (range > 0) + { + range--; + startpos++; + } + else + { + range++; + startpos--; + } + } + return -1; +} /* re_search_2 */ + +/* Declarations and macros for re_match_2. */ + +static int bcmp_translate (); +static boolean alt_match_null_string_p (), + common_op_match_null_string_p (), + group_match_null_string_p (); + +/* Structure for per-register (a.k.a. per-group) information. + This must not be longer than one word, because we push this value + onto the failure stack. Other register information, such as the + starting and ending positions (which are addresses), and the list of + inner groups (which is a bits list) are maintained in separate + variables. 
+ + We are making a (strictly speaking) nonportable assumption here: that + the compiler will pack our bit fields into something that fits into + the type of `word', i.e., is something that fits into one item on the + failure stack. */ +typedef union +{ + fail_stack_elt_t word; + struct + { + /* This field is one if this group can match the empty string, + zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ +#define MATCH_NULL_UNSET_VALUE 3 + unsigned match_null_string_p : 2; + unsigned is_active : 1; + unsigned matched_something : 1; + unsigned ever_matched_something : 1; + } bits; +} register_info_type; + +#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) +#define IS_ACTIVE(R) ((R).bits.is_active) +#define MATCHED_SOMETHING(R) ((R).bits.matched_something) +#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) + + +/* Call this when have matched a real character; it sets `matched' flags + for the subexpressions which we are currently inside. Also records + that those subexprs have matched. */ +#define SET_REGS_MATCHED() \ + do \ + { \ + unsigned r; \ + for (r = lowest_active_reg; r <= highest_active_reg; r++) \ + { \ + MATCHED_SOMETHING (reg_info[r]) \ + = EVER_MATCHED_SOMETHING (reg_info[r]) \ + = 1; \ + } \ + } \ + while (0) + + +/* This converts PTR, a pointer into one of the search strings `string1' + and `string2' into an offset from the beginning of that string. */ +#define POINTER_TO_OFFSET(ptr) \ + (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) + +/* Registers are set to a sentinel when they haven't yet matched. */ +#define REG_UNSET_VALUE ((char *) -1) +#define REG_UNSET(e) ((e) == REG_UNSET_VALUE) + + +/* Macros for dealing with the split strings in re_match_2. */ + +#define MATCHING_IN_FIRST_STRING (dend == end_match_1) + +/* Call before fetching a character with *d. This switches over to + string2 if necessary. 
*/ +#define PREFETCH() \ + while (d == dend) \ + { \ + /* End of string2 => fail. */ \ + if (dend == end_match_2) \ + goto fail; \ + /* End of string1 => advance to string2. */ \ + d = string2; \ + dend = end_match_2; \ + } + + +/* Test if at very beginning or at very end of the virtual concatenation + of `string1' and `string2'. If only one string, it's `string2'. */ +#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) +#define AT_STRINGS_END(d) ((d) == end2) + + +/* Test if D points to a character which is word-constituent. We have + two special cases to check for: if past the end of string1, look at + the first character in string2; and if before the beginning of + string2, look at the last character in string1. */ +#define WORDCHAR_P(d) \ + (SYNTAX ((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ + == Sword) + +/* Test if the character before D and the one at D differ with respect + to being word-constituent. */ +#define AT_WORD_BOUNDARY(d) \ + (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ + || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) + + +/* Free everything we malloc. */ +#ifdef REGEX_MALLOC +#define FREE_VAR(var) if (var) free (var); var = NULL +#define FREE_VARIABLES() \ + do { \ + FREE_VAR (fail_stack.stack); \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (old_regstart); \ + FREE_VAR (old_regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + FREE_VAR (reg_dummy); \ + FREE_VAR (reg_info_dummy); \ + } while (0) +#else /* not REGEX_MALLOC */ +/* Some MIPS systems (at least) want this to free alloca'd storage. */ +#define FREE_VARIABLES() alloca (0) +#endif /* not REGEX_MALLOC */ + + +/* These values must meet several constraints. They must not be valid + register values; since we have a limit of 255 registers (because + we use only one byte in the pattern for the register number), we can + use numbers larger than 255. 
They must differ by 1, because of + NUM_FAILURE_ITEMS above. And the value for the lowest register must + be larger than the value for the highest register, so we do not try + to actually save any registers when none are active. */ +#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) +#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) + +/* Matching routines. */ + +#ifndef emacs /* Emacs never uses this. */ +/* re_match is like re_match_2 except it takes only a single string. */ + +int +re_match (bufp, string, size, pos, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, pos; + struct re_registers *regs; + { + return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); +} +#endif /* not emacs */ + + +/* re_match_2 matches the compiled pattern in BUFP against the + the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 + and SIZE2, respectively). We start matching at POS, and stop + matching at STOP. + + If REGS is non-null and the `no_sub' field of BUFP is nonzero, we + store offsets for the substring each group matched in REGS. See the + documentation for exactly how many groups we fill. + + We return -1 if no match, -2 if an internal error (such as the + failure stack overflowing). Otherwise, we return the length of the + matched substring. */ + +int +re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int pos; + struct re_registers *regs; + int stop; +{ + /* General temporaries. */ + int mcnt; + unsigned char *p1; + + /* Just past the end of the corresponding string. */ + const char *end1, *end2; + + /* Pointers into string1 and string2, just past the last characters in + each to consider matching. */ + const char *end_match_1, *end_match_2; + + /* Where we are in the data, and the end of the current string. */ + const char *d, *dend; + + /* Where we are in the pattern, and the end of the pattern. 
*/ + unsigned char *p = bufp->buffer; + register unsigned char *pend = p + bufp->used; + + /* We use this to map every character in the string. */ + char *translate = bufp->translate; + + /* Failure point stack. Each place that can handle a failure further + down the line pushes a failure point on this stack. It consists of + restart, regend, and reg_info for all registers corresponding to + the subexpressions we're currently inside, plus the number of such + registers, and, finally, two char *'s. The first char * is where + to resume scanning the pattern; the second one is where to resume + scanning the strings. If the latter is zero, the failure point is + a ``dummy''; if a failure happens and the failure point is a dummy, + it gets discarded and the next next one is tried. */ + fail_stack_type fail_stack; +#ifdef DEBUG + static unsigned failure_id = 0; + unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; +#endif + + /* We fill all the registers internally, independent of what we + return, for use in backreferences. The number here includes + an element for register zero. */ + unsigned num_regs = bufp->re_nsub + 1; + + /* The currently active registers. */ + unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; + unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; + + /* Information on the contents of registers. These are pointers into + the input strings; they record just what was matched (on this + attempt) by a subexpression part of the pattern, that is, the + regnum-th regstart pointer points to where in the pattern we began + matching and the regnum-th regend points to right after where we + stopped matching the regnum-th subexpression. (The zeroth register + keeps track of what the whole pattern matches.) 
*/ + const char **regstart, **regend; + + /* If a group that's operated upon by a repetition operator fails to + match anything, then the register for its start will need to be + restored because it will have been set to wherever in the string we + are when we last see its open-group operator. Similarly for a + register's end. */ + const char **old_regstart, **old_regend; + + /* The is_active field of reg_info helps us keep track of which (possibly + nested) subexpressions we are currently in. The matched_something + field of reg_info[reg_num] helps us tell whether or not we have + matched any of the pattern so far this time through the reg_num-th + subexpression. These two fields get reset each time through any + loop their register is in. */ + register_info_type *reg_info; + + /* The following record the register info as found in the above + variables when we find a match better than any we've seen before. + This happens as we backtrack through the failure points, which in + turn happens only if we have not yet matched the entire string. */ + unsigned best_regs_set = false; + const char **best_regstart, **best_regend; + + /* Logically, this is `best_regend[0]'. But we don't want to have to + allocate space for that if we're not allocating space for anything + else (see below). Also, we never need info about register 0 for + any of the other register vectors, and it seems rather a kludge to + treat `best_regend' differently than the rest. So we keep track of + the end of the best match so far in a separate variable. We + initialize this to NULL so that when we backtrack the first time + and need to test it, it's not garbage. */ + const char *match_end = NULL; + + /* Used when we pop values we don't care about. */ + const char **reg_dummy; + register_info_type *reg_info_dummy; + +#ifdef DEBUG + /* Counts the total number of registers pushed. 
*/ + unsigned num_regs_pushed = 0; +#endif + + DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); + + INIT_FAIL_STACK (); + + /* Do not bother to initialize all the register variables if there are + no groups in the pattern, as it takes a fair amount of time. If + there are groups, we include space for register 0 (the whole + pattern), even though we never use it, since it simplifies the + array indexing. We should fix this. */ + if (bufp->re_nsub) + { + regstart = REGEX_TALLOC (num_regs, const char *); + regend = REGEX_TALLOC (num_regs, const char *); + old_regstart = REGEX_TALLOC (num_regs, const char *); + old_regend = REGEX_TALLOC (num_regs, const char *); + best_regstart = REGEX_TALLOC (num_regs, const char *); + best_regend = REGEX_TALLOC (num_regs, const char *); + reg_info = REGEX_TALLOC (num_regs, register_info_type); + reg_dummy = REGEX_TALLOC (num_regs, const char *); + reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); + + if (!(regstart && regend && old_regstart && old_regend && reg_info + && best_regstart && best_regend && reg_dummy && reg_info_dummy)) + { + FREE_VARIABLES (); + return -2; + } + } +#ifdef REGEX_MALLOC + else + { + /* We must initialize all our variables to NULL, so that + `FREE_VARIABLES' doesn't try to free them. */ + regstart = regend = old_regstart = old_regend = best_regstart + = best_regend = reg_dummy = NULL; + reg_info = reg_info_dummy = (register_info_type *) NULL; + } +#endif /* REGEX_MALLOC */ + + /* The starting position is bogus. */ + if (pos < 0 || pos > size1 + size2) + { + FREE_VARIABLES (); + return -1; + } + + /* Initialize subexpression text positions to -1 to mark ones that no + start_memory/stop_memory has been seen for. Also initialize the + register information struct. 
*/ + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = regend[mcnt] + = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; + + REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; + IS_ACTIVE (reg_info[mcnt]) = 0; + MATCHED_SOMETHING (reg_info[mcnt]) = 0; + EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; + } + + /* We move `string1' into `string2' if the latter's empty -- but not if + `string1' is null. */ + if (size2 == 0 && string1 != NULL) + { + string2 = string1; + size2 = size1; + string1 = 0; + size1 = 0; + } + end1 = string1 + size1; + end2 = string2 + size2; + + /* Compute where to stop matching, within the two strings. */ + if (stop <= size1) + { + end_match_1 = string1 + stop; + end_match_2 = string2; + } + else + { + end_match_1 = end1; + end_match_2 = string2 + stop - size1; + } + + /* `p' scans through the pattern as `d' scans through the data. + `dend' is the end of the input string that `d' points within. `d' + is advanced into the following input string whenever necessary, but + this happens before fetching; therefore, at the beginning of the + loop, `d' can be pointing at the end of a string, but it cannot + equal `string2'. */ + if (size1 > 0 && pos <= size1) + { + d = string1 + pos; + dend = end_match_1; + } + else + { + d = string2 + pos - size1; + dend = end_match_2; + } + + DEBUG_PRINT1 ("The compiled pattern is: "); + DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); + DEBUG_PRINT1 ("The string to match is: `"); + DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); + DEBUG_PRINT1 ("'\n"); + + /* This loops over pattern commands. It exits by returning from the + function if the match is complete, or it drops through if the match + fails at this starting point in the input data. */ + for (;;) + { + DEBUG_PRINT2 ("\n0x%x: ", p); + + if (p == pend) + { /* End of pattern means we might have succeeded. */ + DEBUG_PRINT1 ("end of pattern ... 
"); + + /* If we haven't matched the entire string, and we want the + longest match, try backtracking. */ + if (d != end_match_2) + { + DEBUG_PRINT1 ("backtracking.\n"); + + if (!FAIL_STACK_EMPTY ()) + { /* More failure points to try. */ + boolean same_str_p = (FIRST_STRING_P (match_end) + == MATCHING_IN_FIRST_STRING); + + /* If exceeds best match so far, save it. */ + if (!best_regs_set + || (same_str_p && d > match_end) + || (!same_str_p && !MATCHING_IN_FIRST_STRING)) + { + best_regs_set = true; + match_end = d; + + DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + best_regstart[mcnt] = regstart[mcnt]; + best_regend[mcnt] = regend[mcnt]; + } + } + goto fail; + } + + /* If no failure points, don't restore garbage. */ + else if (best_regs_set) + { + restore_best_regs: + /* Restore best match. It may happen that `dend == + end_match_1' while the restored d is in string2. + For example, the pattern `x.*y.*z' against the + strings `x-' and `y-z-', if the two strings are + not consecutive in memory. */ + DEBUG_PRINT1 ("Restoring best registers.\n"); + + d = match_end; + dend = ((d >= string1 && d <= end1) + ? end_match_1 : end_match_2); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = best_regstart[mcnt]; + regend[mcnt] = best_regend[mcnt]; + } + } + } /* d != end_match_2 */ + + DEBUG_PRINT1 ("Accepting match.\n"); + + /* If caller wants register contents data back, do it. */ + if (regs && !bufp->no_sub) + { + /* Have the register data arrays been allocated? */ + if (bufp->regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. 
*/ + regs->num_regs = MAX (RE_NREGS, num_regs + 1); + regs->start = TALLOC (regs->num_regs, regoff_t); + regs->end = TALLOC (regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + bufp->regs_allocated = REGS_REALLOCATE; + } + else if (bufp->regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. */ + if (regs->num_regs < num_regs + 1) + { + regs->num_regs = num_regs + 1; + RETALLOC (regs->start, regs->num_regs, regoff_t); + RETALLOC (regs->end, regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + } + } + else + assert (bufp->regs_allocated == REGS_FIXED); + + /* Convert the pointer data in `regstart' and `regend' to + indices. Register zero has to be set differently, + since we haven't kept track of any info for it. */ + if (regs->num_regs > 0) + { + regs->start[0] = pos; + regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 + : d - string2 + size1); + } + + /* Go through the first `min (num_regs, regs->num_regs)' + registers, since that is all we initialized. */ + for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) + { + if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) + regs->start[mcnt] = regs->end[mcnt] = -1; + else + { + regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); + regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); + } + } + + /* If the regs structure we return has more elements than + were in the pattern, set the extra elements to -1. If + we (re)allocated the registers, this is the case, + because we always allocate enough to have at least one + -1 at the end. 
*/ + for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) + regs->start[mcnt] = regs->end[mcnt] = -1; + } /* regs && !bufp->no_sub */ + + FREE_VARIABLES (); + DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", + nfailure_points_pushed, nfailure_points_popped, + nfailure_points_pushed - nfailure_points_popped); + DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); + + mcnt = d - pos - (MATCHING_IN_FIRST_STRING + ? string1 + : string2 - size1); + + DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); + + return mcnt; + } + + /* Otherwise match next pattern command. */ +#ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) +#else + switch ((re_opcode_t) *p++) +#endif + { + /* Ignore these. Used to ignore the n of succeed_n's which + currently have n == 0. */ + case no_op: + DEBUG_PRINT1 ("EXECUTING no_op.\n"); + break; + + + /* Match the next n pattern characters exactly. The following + byte in the pattern defines n, and the n bytes after that + are the characters to match. */ + case exactn: + mcnt = *p++; + DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); + + /* This is written out as an if-else so we don't waste time + testing `translate' inside the loop. */ + if (translate) + { + do + { + PREFETCH (); + if (translate[(unsigned char) *d++] != (char) *p++) + goto fail; + } + while (--mcnt); + } + else + { + do + { + PREFETCH (); + if (*d++ != (char) *p++) goto fail; + } + while (--mcnt); + } + SET_REGS_MATCHED (); + break; + + + /* Match any character except possibly a newline or a null. 
*/ + case anychar: + DEBUG_PRINT1 ("EXECUTING anychar.\n"); + + PREFETCH (); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') + || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d++; + break; + + + case charset: + case charset_not: + { + register unsigned char c; + boolean not = (re_opcode_t) *(p - 1) == charset_not; + + DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); + + PREFETCH (); + c = TRANSLATE (*d); /* The character to match. */ + + /* Cast to `unsigned' instead of `unsigned char' in case the + bit list is a full 32 bytes long. */ + if (c < (unsigned) (*p * BYTEWIDTH) + && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + p += 1 + *p; + + if (!not) goto fail; + + SET_REGS_MATCHED (); + d++; + break; + } + + + /* The beginning of a group is represented by start_memory. + The arguments are the register number in the next byte, and the + number of groups inner to this one in the next. The text + matched within the group is recorded (in the internal + registers data structure) under the register number. */ + case start_memory: + DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); + + /* Find out if this group can match the empty string. */ + p1 = p; /* To send to group_match_null_string_p. */ + + if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[*p]) + = group_match_null_string_p (&p1, pend, reg_info); + + /* Save the position in the string where we were the last time + we were at this open-group operator in case the group is + operated upon by a repetition operator, e.g., with `(a*)*b' + against `ab'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regstart[*p]) ? 
d : regstart[*p] + : regstart[*p]; + DEBUG_PRINT2 (" old_regstart: %d\n", + POINTER_TO_OFFSET (old_regstart[*p])); + + regstart[*p] = d; + DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); + + IS_ACTIVE (reg_info[*p]) = 1; + MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* This is the new highest active register. */ + highest_active_reg = *p; + + /* If nothing was active before, this is the new lowest active + register. */ + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *p; + + /* Move past the register number and inner group count. */ + p += 2; + break; + + + /* The stop_memory opcode represents the end of a group. Its + arguments are the same as start_memory's: the register + number, and the number of inner groups. */ + case stop_memory: + DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); + + /* We need to save the string position the last time we were at + this close-group operator in case the group is operated + upon by a repetition operator, e.g., with `((a*)*(b*)*)*' + against `aba'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regend[*p]) ? d : regend[*p] + : regend[*p]; + DEBUG_PRINT2 (" old_regend: %d\n", + POINTER_TO_OFFSET (old_regend[*p])); + + regend[*p] = d; + DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); + + /* This register isn't active anymore. */ + IS_ACTIVE (reg_info[*p]) = 0; + + /* If this was the only register active, nothing is active + anymore. */ + if (lowest_active_reg == highest_active_reg) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + { /* We must scan for the new highest active register, since + it isn't necessarily one less than now: consider + (a(b)c(d(e)f)g). When group 3 ends, after the f), the + new highest active register is 1. 
*/ + unsigned char r = *p - 1; + while (r > 0 && !IS_ACTIVE (reg_info[r])) + r--; + + /* If we end up at register zero, that means that we saved + the registers as the result of an `on_failure_jump', not + a `start_memory', and we jumped to past the innermost + `stop_memory'. For example, in ((.)*) we save + registers 1 and 2 as a result of the *, but when we pop + back to the second ), we are at the stop_memory 1. + Thus, nothing is active. */ + if (r == 0) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + highest_active_reg = r; + } + + /* If just failed to match something this time around with a + group that's operated on by a repetition operator, try to + force exit from the ``loop'', and restore the register + information for this group that we had before trying this + last match. */ + if ((!MATCHED_SOMETHING (reg_info[*p]) + || (re_opcode_t) p[-3] == start_memory) + && (p + 2) < pend) + { + boolean is_a_jump_n = false; + + p1 = p + 2; + mcnt = 0; + switch ((re_opcode_t) *p1++) + { + case jump_n: + is_a_jump_n = true; + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (is_a_jump_n) + p1 += 2; + break; + + default: + /* do nothing */ ; + } + p1 += mcnt; + + /* If the next operation is a jump backwards in the pattern + to an on_failure_jump right before the start_memory + corresponding to this stop_memory, exit from the loop + by forcing a failure after pushing on the stack the + on_failure_jump's jump in the pattern, and d. */ + if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump + && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) + { + /* If this group ever matched anything, then restore + what its registers were before trying this last + failed match, e.g., with `(a*)*b' against `ab' for + regstart[1], and, e.g., with `((a*)*(b*)*)*' + against `aba' for regend[3]. 
+ + Also restore the registers for inner groups for, + e.g., `((a*)(b*))*' against `aba' (register 3 would + otherwise get trashed). */ + + if (EVER_MATCHED_SOMETHING (reg_info[*p])) + { + unsigned r; + + EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* Restore this and inner groups' (if any) registers. */ + for (r = *p; r < *p + *(p + 1); r++) + { + regstart[r] = old_regstart[r]; + + /* xx why this test? */ + if ((int) old_regend[r] >= (int) regstart[r]) + regend[r] = old_regend[r]; + } + } + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + PUSH_FAILURE_POINT (p1 + mcnt, d, -2); + + goto fail; + } + } + + /* Move past the register number and the inner group count. */ + p += 2; + break; + + + /* \<digit> has been turned into a `duplicate' command which is + followed by the numeric value of <digit> as the register number. */ + case duplicate: + { + register const char *d2, *dend2; + int regno = *p++; /* Get which register to match against. */ + DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); + + /* Can't back reference a group which we've never matched. */ + if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) + goto fail; + + /* Where in input to try to start matching. */ + d2 = regstart[regno]; + + /* Where to stop matching; if both the place to start and + the place to stop matching are in the same string, then + set to the place to stop, otherwise, for now have to use + the end of the first string. */ + + dend2 = ((FIRST_STRING_P (regstart[regno]) + == FIRST_STRING_P (regend[regno])) + ? regend[regno] : end_match_1); + for (;;) + { + /* If necessary, advance to next segment in register + contents. */ + while (d2 == dend2) + { + if (dend2 == end_match_2) break; + if (dend2 == regend[regno]) break; + + /* End of string1 => advance to string2. */ + d2 = string2; + dend2 = regend[regno]; + } + /* At end of register contents => success */ + if (d2 == dend2) break; + + /* If necessary, advance to next segment in data.
*/ + PREFETCH (); + + /* How many characters left in this segment to match. */ + mcnt = dend - d; + + /* Want how many consecutive characters we can match in + one shot, so, if necessary, adjust the count. */ + if (mcnt > dend2 - d2) + mcnt = dend2 - d2; + + /* Compare that many; failure if mismatch, else move + past them. */ + if (translate + ? bcmp_translate (d, d2, mcnt, translate) + : bcmp (d, d2, mcnt)) + goto fail; + d += mcnt, d2 += mcnt; + } + } + break; + + + /* begline matches the empty string at the beginning of the string + (unless `not_bol' is set in `bufp'), and, if + `newline_anchor' is set, after newlines. */ + case begline: + DEBUG_PRINT1 ("EXECUTING begline.\n"); + + if (AT_STRINGS_BEG (d)) + { + if (!bufp->not_bol) break; + } + else if (d[-1] == '\n' && bufp->newline_anchor) + { + break; + } + /* In all other cases, we fail. */ + goto fail; + + + /* endline is the dual of begline. */ + case endline: + DEBUG_PRINT1 ("EXECUTING endline.\n"); + + if (AT_STRINGS_END (d)) + { + if (!bufp->not_eol) break; + } + + /* We have to ``prefetch'' the next character. */ + else if ((d == end1 ? *string2 : *d) == '\n' + && bufp->newline_anchor) + { + break; + } + goto fail; + + + /* Match at the very beginning of the data. */ + case begbuf: + DEBUG_PRINT1 ("EXECUTING begbuf.\n"); + if (AT_STRINGS_BEG (d)) + break; + goto fail; + + + /* Match at the very end of the data. */ + case endbuf: + DEBUG_PRINT1 ("EXECUTING endbuf.\n"); + if (AT_STRINGS_END (d)) + break; + goto fail; + + + /* on_failure_keep_string_jump is used to optimize `.*\n'. It + pushes NULL as the value for the string on the stack. Then + `pop_failure_point' will keep the current value for the + string, instead of restoring it. To see why, consider + matching `foo\nbar' against `.*\n'. The .* matches the foo; + then the . fails against the \n. But the next thing we want + to do is match the \n against the \n; if we restored the + string value, we would be back at the foo. 
+ + Because this is used only in specific cases, we don't need to + check all the things that `on_failure_jump' does, to make + sure the right things get saved on the stack. Hence we don't + share its code. The only reason to push anything on the + stack at all is that otherwise we would have to change + `anychar's code to do something besides goto fail in this + case; that seems worse than this. */ + case on_failure_keep_string_jump: + DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); + + PUSH_FAILURE_POINT (p + mcnt, NULL, -2); + break; + + + /* Uses of on_failure_jump: + + Each alternative starts with an on_failure_jump that points + to the beginning of the next alternative. Each alternative + except the last ends with a jump that in effect jumps past + the rest of the alternatives. (They really jump to the + ending jump of the following alternative, because tensioning + these jumps is a hassle.) + + Repeats start with an on_failure_jump that points past both + the repetition text and either the following jump or + pop_failure_jump back to this on_failure_jump. */ + case on_failure_jump: + on_failure: + DEBUG_PRINT1 ("EXECUTING on_failure_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); + + /* If this on_failure_jump comes right before a group (i.e., + the original * applied to a group), save the information + for that group and all inner ones, so that if we fail back + to this point, the group's information will be correct. + For example, in \(a*\)*\1, we need the preceding group, + and in \(\(a*\)b*\)\2, we need the inner group. */ + + /* We can't use `p' to check ahead because we push + a failure point to `p + mcnt' after we do this. 
*/ + p1 = p; + + /* We need to skip no_op's before we look for the + start_memory in case this on_failure_jump is happening as + the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 + against aba. */ + while (p1 < pend && (re_opcode_t) *p1 == no_op) + p1++; + + if (p1 < pend && (re_opcode_t) *p1 == start_memory) + { + /* We have a new highest active register now. This will + get reset at the start_memory we are about to get to, + but we will have saved all the registers relevant to + this repetition op, as described above. */ + highest_active_reg = *(p1 + 1) + *(p1 + 2); + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *(p1 + 1); + } + + DEBUG_PRINT1 (":\n"); + PUSH_FAILURE_POINT (p + mcnt, d, -2); + break; + + + /* A smart repeat ends with `maybe_pop_jump'. + We change it to either `pop_failure_jump' or `jump'. */ + case maybe_pop_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); + { + register unsigned char *p2 = p; + + /* Compare the beginning of the repeat with what in the + pattern follows its end. If we can establish that there + is nothing that they would both match, i.e., that we + would have to backtrack because of (as in, e.g., `a*a') + then we can change to pop_failure_jump, because we'll + never have to backtrack. + + This is not true in the case of alternatives: in + `(a|ab)*' we do need to backtrack to the `ab' alternative + (e.g., if the string was `ab'). But instead of trying to + detect that here, the alternative has put on a dummy + failure point which is what we will end up popping. */ + + /* Skip over open/close-group commands. */ + while (p2 + 2 < pend + && ((re_opcode_t) *p2 == stop_memory + || (re_opcode_t) *p2 == start_memory)) + p2 += 3; /* Skip over args, too. */ + + /* If we're at the end of the pattern, we can change. */ + if (p2 == pend) + { + /* Consider what happens when matching ":\(.*\)" + against ":/". I don't really understand this code + yet. 
*/ + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" End of pattern: change to `pop_failure_jump'.\n"); + } + + else if ((re_opcode_t) *p2 == exactn + || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) + { + register unsigned char c + = *p2 == (unsigned char) endline ? '\n' : p2[2]; + p1 = p + mcnt; + + /* p1[0] ... p1[2] are the `on_failure_jump' corresponding + to the `maybe_finalize_jump' of this case. Examine what + follows. */ + if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", + c, p1[5]); + } + + else if ((re_opcode_t) p1[3] == charset + || (re_opcode_t) p1[3] == charset_not) + { + int not = (re_opcode_t) p1[3] == charset_not; + + if (c < (unsigned char) (p1[4] * BYTEWIDTH) + && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + /* `not' is equal to 1 if c would match, which means + that we can't change to pop_failure_jump. */ + if (!not) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); + } + } + } + } + p -= 2; /* Point at relative address again. */ + if ((re_opcode_t) p[-1] != pop_failure_jump) + { + p[-1] = (unsigned char) jump; + DEBUG_PRINT1 (" Match => jump.\n"); + goto unconditional_jump; + } + /* Note fall through. */ + + + /* The end of a simple repeat has a pop_failure_jump back to + its matching on_failure_jump, where the latter will push a + failure point. The pop_failure_jump takes off failure + points put on by this pop_failure_jump's matching + on_failure_jump; we got through the pattern to here from the + matching on_failure_jump, so didn't fail. */ + case pop_failure_jump: + { + /* We need to pass separate storage for the lowest and + highest registers, even though we don't care about the + actual values. Otherwise, we will restore only one + register from the stack, since lowest will == highest in + `pop_failure_point'. 
*/ + unsigned dummy_low_reg, dummy_high_reg; + unsigned char *pdummy; + const char *sdummy; + + DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); + POP_FAILURE_POINT (sdummy, pdummy, + dummy_low_reg, dummy_high_reg, + reg_dummy, reg_dummy, reg_info_dummy); + } + /* Note fall through. */ + + + /* Unconditionally jump (without popping any failure points). */ + case jump: + unconditional_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ + DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); + p += mcnt; /* Do the jump. */ + DEBUG_PRINT2 ("(to 0x%x).\n", p); + break; + + + /* We need this opcode so we can detect where alternatives end + in `group_match_null_string_p' et al. */ + case jump_past_alt: + DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); + goto unconditional_jump; + + + /* Normally, the on_failure_jump pushes a failure point, which + then gets popped at pop_failure_jump. We will end up at + pop_failure_jump, also, and with a pattern of, say, `a+', we + are skipping over the on_failure_jump, so we have to push + something meaningless for pop_failure_jump to pop. */ + case dummy_failure_jump: + DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); + /* It doesn't matter what we push for the string here. What + the code at `fail' tests is the value for the pattern. */ + PUSH_FAILURE_POINT (0, 0, -2); + goto unconditional_jump; + + + /* At the end of an alternative, we need to push a dummy failure + point in case we are followed by a `pop_failure_jump', because + we don't want the failure point for the alternative to be + popped. For example, matching `(a|ab)*' against `aab' + requires that we match the `ab' alternative. */ + case push_dummy_failure: + DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); + /* See comments just above at `dummy_failure_jump' about the + two zeroes. */ + PUSH_FAILURE_POINT (0, 0, -2); + break; + + /* Have to succeed matching what follows at least n times. + After that, handle like `on_failure_jump'. 
*/ + case succeed_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); + + assert (mcnt >= 0); + /* Originally, this is how many times we HAVE to succeed. */ + if (mcnt > 0) + { + mcnt--; + p += 2; + STORE_NUMBER_AND_INCR (p, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); + } + else if (mcnt == 0) + { + DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); + p[2] = (unsigned char) no_op; + p[3] = (unsigned char) no_op; + goto on_failure; + } + break; + + case jump_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); + + /* Originally, this is how many times we CAN jump. */ + if (mcnt) + { + mcnt--; + STORE_NUMBER (p + 2, mcnt); + goto unconditional_jump; + } + /* If don't have to jump any more, skip over the rest of command. */ + else + p += 4; + break; + + case set_number_at: + { + DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + p1 = p + mcnt; + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); + STORE_NUMBER (p1, mcnt); + break; + } + + case wordbound: + DEBUG_PRINT1 ("EXECUTING wordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + break; + goto fail; + + case notwordbound: + DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + goto fail; + break; + + case wordbeg: + DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); + if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) + break; + goto fail; + + case wordend: + DEBUG_PRINT1 ("EXECUTING wordend.\n"); + if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) + && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) + break; + goto fail; + +#ifdef emacs +#ifdef emacs19 + case before_dot: + DEBUG_PRINT1 ("EXECUTING before_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) >= point) + goto fail; + break; + + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) != point) + goto fail; + break; + + case after_dot: + 
DEBUG_PRINT1 ("EXECUTING after_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) <= point) + goto fail; + break; +#else /* not emacs19 */ + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) + goto fail; + break; +#endif /* not emacs19 */ + + case syntaxspec: + DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchsyntax; + + case wordchar: + DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); + mcnt = (int) Sword; + matchsyntax: + PREFETCH (); + if (SYNTAX (*d++) != (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED (); + break; + + case notsyntaxspec: + DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchnotsyntax; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); + mcnt = (int) Sword; + matchnotsyntax: + PREFETCH (); + if (SYNTAX (*d++) == (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED (); + break; + +#else /* not emacs */ + case wordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); + PREFETCH (); + if (!WORDCHAR_P (d)) + goto fail; + SET_REGS_MATCHED (); + d++; + break; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); + PREFETCH (); + if (WORDCHAR_P (d)) + goto fail; + SET_REGS_MATCHED (); + d++; + break; +#endif /* not emacs */ + + default: + abort (); + } + continue; /* Successfully executed one pattern command; keep going. */ + + + /* We goto here if a matching operation fails. */ + fail: + if (!FAIL_STACK_EMPTY ()) + { /* A restart point is known. Restore to that state. */ + DEBUG_PRINT1 ("\nFAIL:\n"); + POP_FAILURE_POINT (d, p, + lowest_active_reg, highest_active_reg, + regstart, regend, reg_info); + + /* If this failure point is a dummy, try the next one. */ + if (!p) + goto fail; + + /* If we failed to the end of the pattern, don't examine *p. 
*/ + assert (p <= pend); + if (p < pend) + { + boolean is_a_jump_n = false; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((re_opcode_t) *p) + { + case jump_n: + is_a_jump_n = true; + case maybe_pop_jump: + case pop_failure_jump: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) + || (!is_a_jump_n + && (re_opcode_t) *p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + + if (d >= string1 && d <= end1) + dend = end_match_1; + } + else + break; /* Matching at this starting point really fails. */ + } /* for (;;) */ + + if (best_regs_set) + goto restore_best_regs; + + FREE_VARIABLES (); + + return -1; /* Failure to match. */ +} /* re_match_2 */ + +/* Subroutine definitions for re_match_2. */ + + +/* We are passed P pointing to a register number after a start_memory. + + Return true if the pattern up to the corresponding stop_memory can + match the empty string, and false otherwise. + + If we find the matching stop_memory, sets P to point to one past its number. + Otherwise, sets P to an undefined byte less than or equal to END. + + We don't handle duplicates properly (yet). */ + +static boolean +group_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; +{ + int mcnt; + /* Point to after the args to the start_memory. */ + unsigned char *p1 = *p + 2; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and return true or + false, as appropriate, when we get to one that can't, or to the + matching stop_memory. */ + + switch ((re_opcode_t) *p1) + { + /* Could be either a loop or a series of alternatives. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + /* If the next operation is not a jump backwards in the + pattern. 
*/ + + if (mcnt >= 0) + { + /* Go through the on_failure_jumps of the alternatives, + seeing if any of the alternatives cannot match nothing. + The last alternative starts with only a jump, + whereas the rest start with on_failure_jump and end + with a jump, e.g., here is the pattern for `a|b|c': + + /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 + /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 + /exactn/1/c + + So, we have to first go through the first (n-1) + alternatives and then deal with the last one separately. */ + + + /* Deal with the first (n-1) alternatives, which start + with an on_failure_jump (see above) that jumps to right + past a jump_past_alt. */ + + while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) + { + /* `mcnt' holds how many bytes long the alternative + is, including the ending `jump_past_alt' and + its number. */ + + if (!alt_match_null_string_p (p1, p1 + mcnt - 3, + reg_info)) + return false; + + /* Move to right after this alternative, including the + jump_past_alt. */ + p1 += mcnt; + + /* Break if it's the beginning of an n-th alternative + that doesn't begin with an on_failure_jump. */ + if ((re_opcode_t) *p1 != on_failure_jump) + break; + + /* Still have to check that it's not an n-th + alternative that starts with an on_failure_jump. */ + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) + { + /* Get to the beginning of the n-th alternative. */ + p1 -= 3; + break; + } + } + + /* Deal with the last alternative: go back and get number + of the `jump_past_alt' just before it. `mcnt' contains + the length of the alternative. */ + EXTRACT_NUMBER (mcnt, p1 - 2); + + if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) + return false; + + p1 += mcnt; /* Get past the n-th alternative. 
*/ + } /* if mcnt > 0 */ + break; + + + case stop_memory: + assert (p1[1] == **p); + *p = p1 + 2; + return true; + + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + return false; +} /* group_match_null_string_p */ + + +/* Similar to group_match_null_string_p, but doesn't deal with alternatives: + It expects P to be the first byte of a single alternative and END one + byte past the last. The alternative can contain groups. */ + +static boolean +alt_match_null_string_p (p, end, reg_info) + unsigned char *p, *end; + register_info_type *reg_info; +{ + int mcnt; + unsigned char *p1 = p; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and break when we get + to one that can't. */ + + switch ((re_opcode_t) *p1) + { + /* It's a loop. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + break; + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + return true; +} /* alt_match_null_string_p */ + + +/* Deals with the ops common to group_match_null_string_p and + alt_match_null_string_p. + + Sets P to one after the op and its arguments, if any. */ + +static boolean +common_op_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; +{ + int mcnt; + boolean ret; + int reg_no; + unsigned char *p1 = *p; + + switch ((re_opcode_t) *p1++) + { + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbeg: + case wordend: + case wordbound: + case notwordbound: +#ifdef emacs + case before_dot: + case at_dot: + case after_dot: +#endif + break; + + case start_memory: + reg_no = *p1; + assert (reg_no > 0 && reg_no <= MAX_REGNUM); + ret = group_match_null_string_p (&p1, end, reg_info); + + /* Have to set this here in case we're checking a group which + contains a group and a back reference to it. 
*/ + + if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; + + if (!ret) + return false; + break; + + /* If this is an optimized succeed_n for zero times, make the jump. */ + case jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (mcnt >= 0) + p1 += mcnt; + else + return false; + break; + + case succeed_n: + /* Get to the number of times to succeed. */ + p1 += 2; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + if (mcnt == 0) + { + p1 -= 4; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + } + else + return false; + break; + + case duplicate: + if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) + return false; + break; + + case set_number_at: + p1 += 4; + + default: + /* All other opcodes mean we cannot match the empty string. */ + return false; + } + + *p = p1; + return true; +} /* common_op_match_null_string_p */ + + +/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN + bytes; nonzero otherwise. */ + +static int +bcmp_translate (s1, s2, len, translate) + unsigned char *s1, *s2; + register int len; + char *translate; +{ + register unsigned char *p1 = s1, *p2 = s2; + while (len) + { + if (translate[*p1++] != translate[*p2++]) return 1; + len--; + } + return 0; +} + +/* Entry points for GNU code. */ + +/* re_compile_pattern is the GNU regular expression compiler: it + compiles PATTERN (of length SIZE) and puts the result in BUFP. + Returns 0 if the pattern was valid, otherwise an error string. + + Assumes the `allocated' (and perhaps `buffer') and `translate' fields + are set in BUFP on entry. + + We call regex_compile to do the actual compilation. */ + +const char * +re_compile_pattern (pattern, length, bufp) + const char *pattern; + int length; + struct re_pattern_buffer *bufp; +{ + reg_errcode_t ret; + + /* GNU code is written to assume at least RE_NREGS registers will be set + (and at least one extra will be -1). 
*/ + bufp->regs_allocated = REGS_UNALLOCATED; + + /* And GNU code determines whether or not to get register information + by passing null for the REGS argument to re_match, etc., not by + setting no_sub. */ + bufp->no_sub = 0; + + /* Match anchors at newline. */ + bufp->newline_anchor = 1; + + ret = regex_compile (pattern, length, re_syntax_options, bufp); + + return re_error_msg[(int) ret]; +} + +/* Entry points compatible with 4.2 BSD regex library. We don't define + them if this is an Emacs or POSIX compilation. */ + +#if !defined (emacs) && !defined (_POSIX_SOURCE) + +/* BSD has one and only one pattern buffer. */ +static struct re_pattern_buffer re_comp_buf; + +char * +re_comp (s) + const char *s; +{ + reg_errcode_t ret; + + if (!s) + { + if (!re_comp_buf.buffer) + return "No previous regular expression"; + return 0; + } + + if (!re_comp_buf.buffer) + { + re_comp_buf.buffer = (unsigned char *) malloc (200); + if (re_comp_buf.buffer == NULL) + return "Memory exhausted"; + re_comp_buf.allocated = 200; + + re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); + if (re_comp_buf.fastmap == NULL) + return "Memory exhausted"; + } + + /* Since `re_exec' always passes NULL for the `regs' argument, we + don't need to initialize the pattern buffer fields which affect it. */ + + /* Match anchors at newlines. */ + re_comp_buf.newline_anchor = 1; + + ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); + + /* Yes, we're discarding `const' here. */ + return (char *) re_error_msg[(int) ret]; +} + + +int +re_exec (s) + const char *s; +{ + const int len = strlen (s); + return + 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); +} +#endif /* not emacs and not _POSIX_SOURCE */ + +/* POSIX.2 functions. Don't define these for Emacs. */ + +#ifndef emacs + +/* regcomp takes a regular expression as a string and compiles it. + + PREG is a regex_t *. We do not expect any fields to be initialized, + since POSIX says we shouldn't. 
Thus, we set + + `buffer' to the compiled pattern; + `used' to the length of the compiled pattern; + `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + REG_EXTENDED bit in CFLAGS is set; otherwise, to + RE_SYNTAX_POSIX_BASIC; + `newline_anchor' to REG_NEWLINE being set in CFLAGS; + `fastmap' and `fastmap_accurate' to zero; + `re_nsub' to the number of subexpressions in PATTERN. + + PATTERN is the address of the pattern string. + + CFLAGS is a series of bits which affect compilation. + + If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we + use POSIX basic syntax. + + If REG_NEWLINE is set, then . and [^...] don't match newline. + Also, regexec will try a match beginning after every newline. + + If REG_ICASE is set, then we considers upper- and lowercase + versions of letters to be equivalent when matching. + + If REG_NOSUB is set, then when PREG is passed to regexec, that + routine will report only success or failure, and nothing about the + registers. + + It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for + the return codes and their meanings.) */ + +int +regcomp (preg, pattern, cflags) + regex_t *preg; + const char *pattern; + int cflags; +{ + reg_errcode_t ret; + unsigned syntax + = (cflags & REG_EXTENDED) ? + RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; + + /* regex_compile will allocate the space for the compiled pattern. */ + preg->buffer = 0; + preg->allocated = 0; + + /* Don't bother to use a fastmap when searching. This simplifies the + REG_NEWLINE case: if we used a fastmap, we'd have to put all the + characters after newlines into the fastmap. This way, we just try + every character. */ + preg->fastmap = 0; + + if (cflags & REG_ICASE) + { + unsigned i; + + preg->translate = (char *) malloc (CHAR_SET_SIZE); + if (preg->translate == NULL) + return (int) REG_ESPACE; + + /* Map uppercase characters to corresponding lowercase ones. */ + for (i = 0; i < CHAR_SET_SIZE; i++) + preg->translate[i] = ISUPPER (i) ? 
tolower (i) : i; + } + else + preg->translate = NULL; + + /* If REG_NEWLINE is set, newlines are treated differently. */ + if (cflags & REG_NEWLINE) + { /* REG_NEWLINE implies neither . nor [^...] match newline. */ + syntax &= ~RE_DOT_NEWLINE; + syntax |= RE_HAT_LISTS_NOT_NEWLINE; + /* It also changes the matching behavior. */ + preg->newline_anchor = 1; + } + else + preg->newline_anchor = 0; + + preg->no_sub = !!(cflags & REG_NOSUB); + + /* POSIX says a null character in the pattern terminates it, so we + can use strlen here in compiling the pattern. */ + ret = regex_compile (pattern, strlen (pattern), syntax, preg); + + /* POSIX doesn't distinguish between an unmatched open-group and an + unmatched close-group: both are REG_EPAREN. */ + if (ret == REG_ERPAREN) ret = REG_EPAREN; + + return (int) ret; +} + + +/* regexec searches for a given pattern, specified by PREG, in the + string STRING. + + If NMATCH is zero or REG_NOSUB was set in the cflags argument to + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + least NMATCH elements, and we set them to the offsets of the + corresponding matched substrings. + + EFLAGS specifies `execution flags' which affect matching: if + REG_NOTBOL is set, then ^ does not match at the beginning of the + string; if REG_NOTEOL is set, then $ does not match at the end. + + We return 0 if we find a match and REG_NOMATCH if not. */ + +int +regexec (preg, string, nmatch, pmatch, eflags) + const regex_t *preg; + const char *string; + size_t nmatch; + regmatch_t pmatch[]; + int eflags; +{ + int ret; + struct re_registers regs; + regex_t private_preg; + int len = strlen (string); + boolean want_reg_info = !preg->no_sub && nmatch > 0; + + private_preg = *preg; + + private_preg.not_bol = !!(eflags & REG_NOTBOL); + private_preg.not_eol = !!(eflags & REG_NOTEOL); + + /* The user has told us exactly how many registers to return + information about, via `nmatch'. We have to pass that on to the + matching routines. 
*/ + private_preg.regs_allocated = REGS_FIXED; + + if (want_reg_info) + { + regs.num_regs = nmatch; + regs.start = TALLOC (nmatch, regoff_t); + regs.end = TALLOC (nmatch, regoff_t); + if (regs.start == NULL || regs.end == NULL) + return (int) REG_NOMATCH; + } + + /* Perform the searching operation. */ + ret = re_search (&private_preg, string, len, + /* start: */ 0, /* range: */ len, + want_reg_info ? ®s : (struct re_registers *) 0); + + /* Copy the register information to the POSIX structure. */ + if (want_reg_info) + { + if (ret >= 0) + { + unsigned r; + + for (r = 0; r < nmatch; r++) + { + pmatch[r].rm_so = regs.start[r]; + pmatch[r].rm_eo = regs.end[r]; + } + } + + /* If we needed the temporary register info, free the space now. */ + free (regs.start); + free (regs.end); + } + + /* We want zero return to mean success, unlike `re_search'. */ + return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; +} + + +/* Returns a message corresponding to an error code, ERRCODE, returned + from either regcomp or regexec. We don't use PREG here. */ + +size_t +regerror (errcode_v, preg, errbuf, errbuf_size) + int errcode_v; + const regex_t *preg; + char *errbuf; + size_t errbuf_size; +{ + const char *msg; + size_t msg_size; + + if (errcode_v < 0 + || errcode_v >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) + /* Only error codes returned by the rest of the code should be passed + to this routine. If we are given anything else, or if other regex + code generates an invalid error code, then the program has a bug. + Dump core so we can fix it. */ + abort (); + + msg = re_error_msg[errcode_v]; + + /* POSIX doesn't require that we do anything in this case, but why + not be nice. */ + if (! msg) + msg = "Success"; + + msg_size = strlen (msg) + 1; /* Includes the null. 
*/ + + if (errbuf_size != 0) + { + if (msg_size > errbuf_size) + { + strncpy (errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = 0; + } + else + strcpy (errbuf, msg); + } + + return msg_size; +} + + +/* Free dynamically allocated space used by PREG. */ + +void +regfree (preg) + regex_t *preg; +{ + if (preg->buffer != NULL) + free (preg->buffer); + preg->buffer = NULL; + + preg->allocated = 0; + preg->used = 0; + + if (preg->fastmap != NULL) + free (preg->fastmap); + preg->fastmap = NULL; + preg->fastmap_accurate = 0; + + if (preg->translate != NULL) + free (preg->translate); + preg->translate = NULL; +} + +#endif /* not emacs */ + +/* +Local variables: +make-backup-files: t +version-control: t +trim-versions-without-asking: nil +End: +*/ Index: branches/apertium-tagger/apertium2/apertium/win32/regex.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/regex.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/regex.h (revision 69632) @@ -0,0 +1,498 @@ +/* Definitions for data structures and routines for the regular + expression library, version 0.12. + + Copyright (C) 1985, 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +#ifndef __REGEXP_LIBRARY_H__ +#define __REGEXP_LIBRARY_H__ + +#ifdef __cplusplus + extern "C" { +#endif + +/* POSIX says that <sys/types.h> must be included (by the caller) before + <regex.h>. */ + +#ifdef VMS +/* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it + should be there. */ +#include +#endif + + +/* The following bits are used to determine the regexp syntax we + recognize. The set/not-set meanings are chosen so that Emacs syntax + remains the value 0. The bits are given in alphabetical order, and + the definitions shifted by one from the previous bit; thus, when we + add or remove a bit, only one other definition need change. */ +typedef unsigned reg_syntax_t; + +/* If this bit is not set, then \ inside a bracket expression is literal. + If set, then such a \ quotes the following character. */ +#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) + +/* If this bit is not set, then + and ? are operators, and \+ and \? are + literals. + If set, then \+ and \? are operators and + and ? are literals. */ +#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) + +/* If this bit is set, then character classes are supported. They are: + [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], + [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. + If not set, then character classes are not supported. */ +#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) + +/* If this bit is set, then ^ and $ are always anchors (outside bracket + expressions, of course). + If this bit is not set, then it depends: + ^ is an anchor if it is at the beginning of a regular + expression or after an open-group or an alternation operator; + $ is an anchor if it is at the end of a regular expression, or + before a close-group or an alternation operator. + + This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because + POSIX draft 11.2 says that * etc. in leading positions is undefined.
+ We already implemented a previous draft which made those constructs + invalid, though, so we haven't changed the code back. */ +#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) + +/* If this bit is set, then special characters are always special + regardless of where they are in the pattern. + If this bit is not set, then special characters are special only in + some contexts; otherwise they are ordinary. Specifically, + * + ? and intervals are only special when not after the beginning, + open-group, or alternation operator. */ +#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) + +/* If this bit is set, then *, +, ?, and { cannot be first in an re or + immediately after an alternation or begin-group operator. */ +#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) + +/* If this bit is set, then . matches newline. + If not set, then it doesn't. */ +#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) + +/* If this bit is set, then . doesn't match NUL. + If not set, then it does. */ +#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) + +/* If this bit is set, nonmatching lists [^...] do not match newline. + If not set, they do. */ +#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) + +/* If this bit is set, either \{...\} or {...} defines an + interval, depending on RE_NO_BK_BRACES. + If not set, \{, \}, {, and } are literals. */ +#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) + +/* If this bit is set, +, ? and | aren't recognized as operators. + If not set, they are. */ +#define RE_LIMITED_OPS (RE_INTERVALS << 1) + +/* If this bit is set, newline is an alternation operator. + If not set, newline is literal. */ +#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) + +/* If this bit is set, then `{...}' defines an interval, and \{ and \} + are literals. + If not set, then `\{...\}' defines an interval. */ +#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) + +/* If this bit is set, (...) defines a group, and \( and \) are literals. 
+ If not set, \(...\) defines a group, and ( and ) are literals. */ +#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) + +/* If this bit is set, then \<digit> matches <digit>. + If not set, then \<digit> is a back-reference. */ +#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) + +/* If this bit is set, then | is an alternation operator, and \| is literal. + If not set, then \| is an alternation operator, and | is literal. */ +#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) + +/* If this bit is set, then an ending range point collating lower + than the starting range point, as in [z-a], is invalid. + If not set, then when ending range point collates lower than the + starting range point, the range is ignored. */ +#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) + +/* If this bit is set, then an unmatched ) is ordinary. + If not set, then an unmatched ) is invalid. */ +#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) + +/* This global variable defines the particular regexp syntax to use (for + some interfaces). When a regexp is compiled, the syntax used is + stored in the pattern buffer, so changing this does not affect + already-compiled regexps. */ +extern reg_syntax_t re_syntax_options; + +/* Define combinations of the above bits for the standard possibilities. + (The [[[ comments delimit what gets put into the Texinfo file, so + don't delete them!)
*/ +/* [[[begin syntaxes]]] */ +#define RE_SYNTAX_EMACS 0 + +#define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +#define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + +#define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + +#define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + +#define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + +/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ +#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + +#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + +/* Syntax bits common to both basic and extended POSIX regex syntax. */ +#define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + +#define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + +/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ +#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + +#define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. 
*/ +#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) +/* [[[end syntaxes]]] */ + +/* Maximum number of duplicates an interval can allow. Some systems + (erroneously) define this in other header files, but we want our + value, so remove any previous define. */ +#ifdef RE_DUP_MAX +#undef RE_DUP_MAX +#endif +#define RE_DUP_MAX ((1 << 15) - 1) + + +/* POSIX `cflags' bits (i.e., information for `regcomp'). */ + +/* If this bit is set, then use extended regular expression syntax. + If not set, then use basic regular expression syntax. */ +#define REG_EXTENDED 1 + +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define REG_ICASE (REG_EXTENDED << 1) + +/* If this bit is set, then anchors do not match at newline + characters in the string. + If not set, then anchors do match at newlines. */ +#define REG_NEWLINE (REG_ICASE << 1) + +/* If this bit is set, then report only success or fail in regexec. + If not set, then returns differ between not matching and errors. */ +#define REG_NOSUB (REG_NEWLINE << 1) + + +/* POSIX `eflags' bits (i.e., information for regexec). */ + +/* If this bit is set, then the beginning-of-line operator doesn't match + the beginning of the string (presumably because it's not the + beginning of a line). + If not set, then the beginning-of-line operator does match the + beginning of the string. */ +#define REG_NOTBOL 1 + +/* Like REG_NOTBOL, except for the end-of-line. */ +#define REG_NOTEOL (1 << 1) + + +/* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. */ +typedef enum +{ + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + + /* POSIX regcomp return error codes. (In the order listed in the + standard.) 
*/ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Not implemented. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched \{. */ + REG_BADBR, /* Invalid contents of \{\}. */ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ +} reg_errcode_t; + +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + `translate', and `no_sub' can be set. After the pattern has been + compiled, the `re_nsub' field is available. All other fields are + private to the regex routines. */ + +struct re_pattern_buffer +{ +/* [[[begin pattern_buffer]]] */ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. */ + char *translate; + + /* Number of subexpressions found by the compiler. 
*/ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#define REGS_UNALLOCATED 0 +#define REGS_REALLOCATE 1 +#define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + +/* [[[end pattern_buffer]]] */ +}; + +typedef struct re_pattern_buffer regex_t; + + +/* search.c (search_buffer) in Emacs needs this one opcode value. It is + defined both in `regex.c' and here. */ +#define RE_EXACTN_VALUE 1 + +/* Type for byte offsets within the string. POSIX mandates this. */ +typedef int regoff_t; + + +/* This is the structure we store register match data in. See + regex.texinfo for a full description of what registers match. */ +struct re_registers +{ + unsigned num_regs; + regoff_t *start; + regoff_t *end; +}; + + +/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. */ +#ifndef RE_NREGS +#define RE_NREGS 30 +#endif + + +/* POSIX specification for registers. 
Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ +typedef struct +{ + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ +} regmatch_t; + +/* Declarations for routines. */ + +/* To avoid duplicating every routine declaration -- once with a + prototype (if we are ANSI), and once without (if we aren't) -- we + use the following macro to declare argument types. This + unfortunately clutters up the declarations a bit, but I think it's + worth it. */ + +#if __STDC__ + +#define _RE_ARGS(args) args + +#else /* not __STDC__ */ + +#define _RE_ARGS(args) () + +#endif /* not __STDC__ */ + +/* Sets the current default syntax to SYNTAX, and return the old syntax. + You can also simply assign to the `re_syntax_options' variable. */ +extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); + +/* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. */ +extern const char *re_compile_pattern + _RE_ARGS ((const char *pattern, int length, + struct re_pattern_buffer *buffer)); + + +/* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if was an + internal error. */ +extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); + + +/* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. Also return register + information in REGS (if REGS and BUFFER->no_sub are nonzero). 
*/ +extern int re_search + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, struct re_registers *regs)); + + +/* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. */ +extern int re_search_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop)); + + +/* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ +extern int re_match + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs)); + + +/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ +extern int re_match_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop)); + + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ +extern void re_set_registers + _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends)); + +/* 4.2 bsd compatibility. */ +extern char *re_comp _RE_ARGS ((const char *)); +extern int re_exec _RE_ARGS ((const char *)); + +/* POSIX compatibility. 
*/ +extern int regcomp _RE_ARGS ((regex_t *preg, const char *pattern, int cflags)); +extern int regexec + _RE_ARGS ((const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags)); +extern size_t regerror + _RE_ARGS ((int errcode, const regex_t *preg, char *errbuf, + size_t errbuf_size)); +extern void regfree _RE_ARGS ((regex_t *preg)); + +#ifdef __cplusplus + } +#endif + +#endif /* not __REGEXP_LIBRARY_H__ */ + +/* +Local variables: +make-backup-files: t +version-control: t +trim-versions-without-asking: nil +End: +*/ Index: branches/apertium-tagger/apertium2/apertium/win32/runner_skeleton.c =================================================================== --- branches/apertium-tagger/apertium2/apertium/win32/runner_skeleton.c (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/win32/runner_skeleton.c (revision 69632) @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include + +#define PATH_BUF_SIZE 8191 +#define NUM_EXEC_ARGS 3 +#define ARG_BUF_SIZE (8191 - NUM_EXEC_ARGS) +#define ENV_VAR_SIZE 32768 + +/* Strip the last component off a pathname. + Thus, parent("a\b\c") -> "a\b" */ +char* parent(char* parent_buf) { + char* pos = strrchr(parent_buf, '\\'); + pos[0] = '\0'; + + return parent_buf; +} + +/* Remove the .exe if the user invoked this executable with its extension. + That is, if the user typed something like apertium.exe instead of apertium. */ +char* remove_extension(char* buf) { + char* pos = strrchr(buf, '.'); + + if (pos != NULL && strcmp(pos, ".exe") == 0) { + pos[0] = '\0'; + } + + return buf; +} + +#define MIN(x, y) ((x) < (y) ? 
x : y) + +int main(int argc, char* argv[]) { + char *args[ARG_BUF_SIZE]; + char base_path[PATH_BUF_SIZE + 1]; + char script_path[PATH_BUF_SIZE + 1]; + char shell_path[PATH_BUF_SIZE + 1]; + char env_path[ENV_VAR_SIZE]; + int argi; + + _fullpath(shell_path, argv[0], PATH_BUF_SIZE); + strcpy(script_path, shell_path); + strcpy(base_path, shell_path); + + parent(shell_path); + strcat(shell_path, "\\sh.exe"); + + remove_extension(script_path); + parent(base_path); + + args[0] = shell_path; + args[1] = "--norc"; + args[2] = script_path; + + /* Any parameters passed on the command line will be passed through to the shell script */ + for (argi = 0; argi < MIN(argc - 1, ARG_BUF_SIZE); argi++) { + printf("%s\n", argv[argi + 1]); + args[argi + NUM_EXEC_ARGS] = argv[argi + 1]; + } + /* Signal the end of the argument list */ + args[argi + NUM_EXEC_ARGS] = NULL; + + /* Add this executable's directory to the path */ + strcpy(env_path, "PATH="); + strcat(env_path, getenv("PATH")); + strcat(env_path, ";"); + strcat(env_path, base_path); + _putenv(env_path); + + _spawnv(_P_WAIT, args[0], &args[1]); + + _flushall(); +} Index: branches/apertium-tagger/apertium2/apertium/serialiser.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/serialiser.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/serialiser.h (revision 69632) @@ -0,0 +1,288 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef SERIALISER_H +#define SERIALISER_H + +#include "a.h" +#include "basic_exception_type.h" +#include "analysis.h" +#include "exception.h" +#include "i.h" +#include "lemma.h" +#include "morpheme.h" +#include "tag.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +namespace { +template +static unsigned char compressedSize(const SerialisedType &SerialisedType_) { + unsigned char compressedSize_ = 0; + + for (; (SerialisedType_ >> + std::numeric_limits::digits * compressedSize_) != 0; + ++compressedSize_) { + } + + return compressedSize_; +} + +template class Serialiser; + +template <> class Serialiser { +public: + inline static void serialise(const a &SerialisedType_, std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const Analysis &SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const i &SerialisedType_, std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const Lemma &SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const Morpheme &SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const Tag &SerialisedType_, + std::ostream &Output); +}; + +template +class Serialiser > { +public: + inline static void + serialise(const std::basic_string &SerialisedType_, + std::ostream &Output); +}; + +template +class Serialiser > { +public: + inline static void + serialise(const std::map &SerialisedType_, + std::ostream &Output); +}; + +template +class Serialiser > { +public: + inline static void + serialise(const std::pair 
&SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const std::size_t &SerialisedType_, + std::ostream &Output); +}; + +template class Serialiser > { +public: + inline static void serialise(const std::vector &SerialisedType_, + std::ostream &Output); +}; + +template <> class Serialiser { +public: + inline static void serialise(const wchar_t &SerialisedType_, + std::ostream &Output); +}; +} + +template +inline void serialise(const SerialisedType &SerialisedType_, + std::ostream &Output) { + Serialiser::serialise(SerialisedType_, Output); +} + +void Serialiser::serialise(const a &SerialisedType_, std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheTags, Output); + ::Apertium::serialise(SerialisedType_.TheMorphemes, Output); +} + +void Serialiser::serialise(const Analysis &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheMorphemes, Output); +} + +void Serialiser::serialise(const i &SerialisedType_, std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheTags, Output); +} + +void Serialiser::serialise(const Lemma &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheLemma, Output); +} + +void Serialiser::serialise(const Morpheme &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheLemma, Output); + ::Apertium::serialise(SerialisedType_.TheTags, Output); +} + +void Serialiser::serialise(const Tag &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.TheTag, Output); +} + +template +void Serialiser >::serialise( + const std::basic_string &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.size(), Output); + + for (typename std::basic_string::const_iterator + SerialisedType_iterator = SerialisedType_.begin(); + // Call .end() each iteration to save memory. 
+ SerialisedType_iterator != SerialisedType_.end(); + ++SerialisedType_iterator) { + ::Apertium::serialise(*SerialisedType_iterator, Output); + } +} + +template +void Serialiser >::serialise( + const std::map &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.size(), Output); + + for (typename std::map::const_iterator + SerialisedType_iterator = SerialisedType_.begin(); + // Call .end() each iteration to save memory. + SerialisedType_iterator != SerialisedType_.end(); + ++SerialisedType_iterator) { + ::Apertium::serialise(*SerialisedType_iterator, Output); + } +} + +template +void Serialiser >::serialise( + const std::pair &SerialisedType_, + std::ostream &Output) { + ::Apertium::serialise(SerialisedType_.first, Output); + ::Apertium::serialise(SerialisedType_.second, Output); +} + +void Serialiser::serialise(const std::size_t &SerialisedType_, + std::ostream &Output) { + try { + Output.put(compressedSize(SerialisedType_)); + + if (!Output) { + std::stringstream what_; + what_ << "can't serialise size " << std::hex + << /* [1] */ +compressedSize(SerialisedType_) << std::dec; + throw Exception::Serialiser::not_Stream_good(what_); + } + + for (unsigned char CompressedSize = compressedSize(SerialisedType_); + CompressedSize != 0; Output.put(static_cast( + SerialisedType_ >> + std::numeric_limits::digits * --CompressedSize))) { + if (!Output) { + std::stringstream what_; + what_ << "can't serialise byte " << std::hex + << /* [1] */ +static_cast( + SerialisedType_ >> + std::numeric_limits::digits * + CompressedSize) << std::dec; + throw Exception::Serialiser::not_Stream_good(what_); + } + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't serialise const std::size_t & : " + << basic_ExceptionType_.what(); + throw Exception::Serialiser::size_t_(what_); + } +} + +template +void Serialiser >::serialise( + const std::vector &SerialisedType_, std::ostream &Output) { + 
::Apertium::serialise(SerialisedType_.size(), Output); + + for (typename std::vector::const_iterator value_type_ = + SerialisedType_.begin(); + // Call .end() each iteration to save memory. + value_type_ != SerialisedType_.end(); ++value_type_) { + ::Apertium::serialise(*value_type_, Output); + } +} + +void Serialiser::serialise(const wchar_t &SerialisedType_, + std::ostream &Output) { + try { + Output.put(compressedSize(SerialisedType_)); + + if (!Output) { + std::stringstream what_; + what_ << "can't serialise size " << std::hex + << /* [1] */ +compressedSize(SerialisedType_); + throw Exception::Serialiser::not_Stream_good(what_); + } + + for (unsigned char CompressedSize = compressedSize(SerialisedType_); + CompressedSize != 0; Output.put(static_cast( + static_cast(SerialisedType_) >> + std::numeric_limits::digits * --CompressedSize))) { + if (!Output) { + std::stringstream what_; + what_ << "can't serialise byte " << std::hex + << /* [1] */ +(static_cast(SerialisedType_) >> + std::numeric_limits::digits * + CompressedSize); + throw Exception::Serialiser::not_Stream_good(what_); + } + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't serialise const wchar_t & : " + << basic_ExceptionType_.what(); + throw Exception::Serialiser::wchar_t_(what_); + } +} +} + +// [1] operator+ promotes its operand to a printable integral type. 
+ +#endif // SERIALISER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.cc (revision 69632) @@ -0,0 +1,50 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "stream_5_3_1_tagger_trainer.h" + +#include "analysis.h" +#include "basic_tagger.h" +#include "serialiser.h" + +#include +#include +#include +#include + +namespace Apertium { +/* Constructs the trainer: default-constructs the basic_5_3_1_Tagger base and forwards Flags_ to basic_StreamTaggerTrainer. */Stream_5_3_1_TaggerTrainer::Stream_5_3_1_TaggerTrainer( + const basic_Tagger::Flags &Flags_) + : basic_5_3_1_Tagger(), basic_StreamTaggerTrainer(Flags_) {} + +/* Writes Model to Serialised_basic_Tagger through ::Apertium::serialise. */void Stream_5_3_1_TaggerTrainer::serialise( + std::ostream &Serialised_basic_Tagger) const { + ::Apertium::serialise(Model, Serialised_basic_Tagger); +} + +/* Adds Coefficient_ to the count stored in Model for Analysis_; insert() first creates the entry with a count of 0 when Analysis_ is not yet present, and otherwise yields the existing entry. */void +Stream_5_3_1_TaggerTrainer::train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) { + Model.insert(std::make_pair(Analysis_, 0)).first->second += Coefficient_; +} + +/* Multiplies every count stored in Model by OccurrenceCoefficientMultiplier. */void Stream_5_3_1_TaggerTrainer::multiplyModel( + const std::size_t &OccurrenceCoefficientMultiplier) { + for (std::map::iterator Analysis_ = Model.begin(); + Analysis_ != Model.end(); ++Analysis_) { + Analysis_->second *= OccurrenceCoefficientMultiplier; + } +} +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.cc (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "stream_5_3_2_tagger_trainer.h" + +#include "a.h" +#include "analysis.h" +#include "lemma.h" +#include "serialiser.h" + +#include +#include +#include + +namespace Apertium { +/* Constructs the trainer by forwarding Flags_ to basic_StreamTaggerTrainer. */Stream_5_3_2_TaggerTrainer::Stream_5_3_2_TaggerTrainer(const Flags &Flags_) + : basic_StreamTaggerTrainer(Flags_) {} + +/* Writes Model to Serialised_basic_Tagger through ::Apertium::serialise. */void Stream_5_3_2_TaggerTrainer::serialise( + std::ostream &Serialised_basic_Tagger) const { + ::Apertium::serialise(Model, Serialised_basic_Tagger); +} + +/* Adds Coefficient_ to a count stored in the two-level map Model. The outer and the inner key are both conversions of Analysis_ (the static_cast target types are missing from this extract; presumably the types declared in a.h and lemma.h - confirm). A missing outer entry is created with an empty inner map, and a missing inner entry with a count of 0, before the addition. */void +Stream_5_3_2_TaggerTrainer::train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) { + Model.insert(std::make_pair(static_cast(Analysis_), + std::map())) + .first->second.insert(std::make_pair(static_cast(Analysis_), 0)) + .first->second += Coefficient_; +} + +/* Multiplies every count in every inner map of Model by OccurrenceCoefficientMultiplier. */void Stream_5_3_2_TaggerTrainer::multiplyModel( + const std::size_t &OccurrenceCoefficientMultiplier) { + for (std::map >::iterator a_ = Model.begin(); + a_ != Model.end(); ++a_) { + for (std::map::iterator r_ = a_->second.begin(); + r_ != a_->second.end(); ++r_) { + r_->second *= OccurrenceCoefficientMultiplier; + } + } +} +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.cc (revision 69632) @@ -0,0 +1,88 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "analysis.h" +#include "i.h" +#include "lemma.h" +#include "serialiser.h" +#include "stream_5_3_3_tagger_trainer.h" + +#include +#include +#include +#include +#include + +namespace Apertium { +/* Constructs the trainer by forwarding Flags_ to basic_StreamTaggerTrainer. */Stream_5_3_3_TaggerTrainer::Stream_5_3_3_TaggerTrainer(const Flags &Flags_) + : basic_StreamTaggerTrainer(Flags_) {} + +/* Writes Model to Serialised_basic_Tagger through ::Apertium::serialise. */void Stream_5_3_3_TaggerTrainer::serialise( + std::ostream &Serialised_basic_Tagger) const { + ::Apertium::serialise(Model, Serialised_basic_Tagger); +} + +/* Updates three nested count tables held in Model (a pair whose second member is itself a pair), adding Coefficient_ to each touched count and creating missing entries with a count of 0. Once per call: the entry of Model.first keyed by i(Analysis_) then Lemma(Analysis_). Then, for every morpheme after the first: the entry of Model.second.first keyed by i(previous morpheme) then Lemma(morpheme), and the entry of Model.second.second keyed by Lemma(morpheme) then i(morpheme). NOTE(review): the loop starts one past TheMorphemes.begin(), which is only valid when Analysis_.TheMorphemes is non-empty; nothing in this function checks that - confirm the invariant is established elsewhere. */void +Stream_5_3_3_TaggerTrainer::train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) { + Model.first.insert( + std::make_pair(i(Analysis_), std::map())) + .first->second.insert(std::make_pair(Lemma(Analysis_), 0)) + .first->second += Coefficient_; + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + Model.second.first.insert(std::make_pair(i(*(Morpheme_ - 1)), + std::map())) + .first->second.insert(std::make_pair(Lemma(*Morpheme_), 0)) + .first->second += Coefficient_; + Model.second.second.insert(std::make_pair(Lemma(*Morpheme_), + std::map())) + .first->second.insert(std::make_pair(i(*Morpheme_), 0)) + .first->second += Coefficient_; + } +} + +/* Multiplies every count in the three nested tables (Model.first, Model.second.first, Model.second.second) by OccurrenceCoefficientMultiplier, one pair of nested loops per table. */void Stream_5_3_3_TaggerTrainer::multiplyModel( + const std::size_t &OccurrenceCoefficientMultiplier) { + for (std::map >::iterator i_ = + Model.first.begin(); + i_ != Model.first.end(); ++i_) { + for (std::map::iterator Lemma_ = i_->second.begin(); + Lemma_ != i_->second.end(); ++Lemma_) { + Lemma_->second *= OccurrenceCoefficientMultiplier; + } + } + + for (std::map 
>::iterator i_ = + Model.second.first.begin(); + i_ != Model.second.first.end(); ++i_) { + for (std::map::iterator Lemma_ = i_->second.begin(); + Lemma_ != i_->second.end(); ++Lemma_) { + Lemma_->second *= OccurrenceCoefficientMultiplier; + } + } + + for (std::map >::iterator Lemma_ = + Model.second.second.begin(); + Lemma_ != Model.second.second.end(); ++Lemma_) { + for (std::map::iterator i_ = Lemma_->second.begin(); + i_ != Lemma_->second.end(); ++i_) { + i_->second *= OccurrenceCoefficientMultiplier; + } + } +} +} Index: branches/apertium-tagger/apertium2/apertium/Makefile.am =================================================================== --- branches/apertium-tagger/apertium2/apertium/Makefile.am (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/Makefile.am (revision 69632) @@ -0,0 +1,651 @@ +AUTOMAKE_OPTIONS = subdir-objects + +h_sources = a.h \ + align.h \ + analysis.h \ + apertium_re.h \ + apertium_tagger.h \ + basic_5_3_1_tagger.h \ + basic_5_3_2_tagger.h \ + basic_5_3_3_tagger.h \ + basic_exception_type.h \ + basic_stream_tagger.h \ + basic_stream_tagger_trainer.h \ + basic_tagger.h \ + collection.h \ + constant_manager.h \ + constructor_eq_delete.h \ + deserialiser.h \ + endian_double_util.h \ + err_exception.h \ + exception.h \ + exception_type.h \ + file_tagger.h \ + hmm.h \ + i.h \ + interchunk.h \ + interchunk_word.h \ + latex_accentsmap.h \ + lemma.h \ + lexical_unit.h \ + linebreak.h \ + lswpost.h \ + morpheme.h \ + morpho_stream.h \ + optional.h \ + postchunk.h \ + serialiser.h \ + stream.h \ + stream_5_3_1_tagger.h \ + stream_5_3_2_tagger.h \ + stream_5_3_3_tagger.h \ + stream_5_3_1_tagger_trainer.h \ + stream_5_3_2_tagger_trainer.h \ + stream_5_3_3_tagger_trainer.h \ + streamed_type.h \ + string_utils.h \ + tag.h \ + tagger_data.h \ + tagger_data_hmm.h \ + tagger_data_lsw.h \ + tagger_utils.h \ + tagger_word.h \ + tmx_aligner_tool.h \ + tmx_alignment.h \ + tmx_align_parameters.h \ + tmx_arguments_parser.h \ + tmx_book_to_matrix.h 
\ + tmx_builder.h \ + tmx_dictionary.h \ + tmx_dic_tree.h \ + tmx_quasi_diagonal.h \ + tmx_serialize_impl.h \ + tmx_strings_and_streams.h \ + tmx_trail_postprocessors.h \ + tmx_translate.h \ + tmx_words.h \ + transfer_data.h \ + transfer.h \ + transfer_instr.h \ + transfer_mult.h \ + transfer_token.h \ + transfer_word.h \ + transfer_word_list.h \ + trx_reader.h \ + tsx_reader.h \ + ttag.h \ + unlocked_cstdio.h \ + utf_converter.h \ + wchar_t_exception.h \ + wchar_t_exception_type.h + +#DEPR.: +# lextor_data.h +# lextor_eval.h +# lextor.h +# lextor_word.h + +cc_sources = a.cc \ + align.cc \ + analysis.cc \ + apertium_re.cc \ + basic_5_3_1_tagger.cc \ + basic_5_3_2_tagger.cc \ + basic_exception_type.cc \ + basic_stream_tagger.cc \ + basic_stream_tagger_trainer.cc \ + basic_tagger.cc \ + collection.cc \ + constant_manager.cc \ + endian_double_util.cc \ + exception_type.cc \ + file_tagger.cc \ + hmm.cc \ + i.cc \ + interchunk.cc \ + interchunk_word.cc \ + latex_accentsmap.cc \ + lemma.cc \ + linebreak.cc \ + lswpost.cc \ + morpheme.cc \ + morpho_stream.cc \ + postchunk.cc \ + stream.cc \ + stream_5_3_1_tagger.cc \ + stream_5_3_2_tagger.cc \ + stream_5_3_3_tagger.cc \ + stream_5_3_1_tagger_trainer.cc \ + stream_5_3_2_tagger_trainer.cc \ + stream_5_3_3_tagger_trainer.cc \ + string_utils.cc \ + tag.cc \ + tagger_data.cc \ + tagger_data_hmm.cc \ + tagger_data_lsw.cc \ + tagger_utils.cc \ + tagger_word.cc \ + tmx_aligner_tool.cc \ + tmx_alignment.cc \ + tmx_arguments_parser.cc \ + tmx_book_to_matrix.cc \ + tmx_builder.cc \ + tmx_dictionary.cc \ + tmx_strings_and_streams.cc \ + tmx_trail_postprocessors.cc \ + tmx_translate.cc \ + transfer.cc \ + transfer_data.cc \ + transfer_instr.cc \ + transfer_mult.cc \ + transfer_token.cc \ + transfer_word.cc \ + transfer_word_list.cc \ + trx_reader.cc \ + tsx_reader.cc \ + utf_converter.cc \ + wchar_t_exception_type.cc +#DEPR.: +# lextor.cc +# lextor_data.cc +# lextor_eval.cc +# lextor_word.cc + +library_includedir = 
$(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) +library_include_HEADERS = $(h_sources) + +GENERATEDSCRIPTS = apertium-gen-deformat apertium-gen-reformat \ + apertium-validate-tagger \ + apertium-validate-transfer apertium-validate-dictionary \ + apertium-validate-modes \ + apertium-validate-interchunk \ + apertium-validate-postchunk apertium apertium-unformat \ + apertium-gen-modes apertium-validate-acx \ + apertium-utils-fixlatex +#DEPR.: + #apertium-preprocess-corpus-lextor + #apertium-gen-stopwords-lextor + #apertium-gen-lextorbil + #apertium-gen-lextormono apertium-gen-wlist-lextor + +lib_LTLIBRARIES = libapertium3.la +libapertium3_la_SOURCES = $(h_sources) $(cc_sources) +libapertium3_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) -release $(GENERIC_RELEASE) + +bin_PROGRAMS = apertium-deshtml \ + apertium-deslatex \ + apertium-desmediawiki \ + apertium-desodt \ + apertium-despptx \ + apertium-desrtf \ + apertium-destxt \ + apertium-deswxml \ + apertium-desxlsx \ + apertium-desxpresstag \ + apertium-filter-ambiguity \ + apertium-interchunk \ + apertium-multiple-translations \ + apertium-postchunk \ + apertium-postlatex \ + apertium-postlatex-raw \ + apertium-prelatex \ + apertium-preprocess-transfer \ + apertium-pretransfer \ + apertium-rehtml \ + apertium-rehtml-noent \ + apertium-relatex \ + apertium-remediawiki \ + apertium-reodt \ + apertium-repptx \ + apertium-rertf \ + apertium-retxt \ + apertium-rewxml \ + apertium-rexlsx \ + apertium-rexpresstag \ + apertium-tagger \ + apertium-tagger-apply-new-rules \ + apertium-tagger-readwords \ + apertium-tmxbuild \ + apertium-transfer + +bin_SCRIPTS = $(GENERATEDSCRIPTS) + +instdir = apertium + +apertiumdir = $(prefix)/share/apertium +apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION) +apertiumlib = $(prefix)/lib +apertiumsysconf = $(prefix)/etc/apertium + +apertium_DATA = deformat.xsl reformat.xsl new2old.xsl lexchoice.xsl \ + lexchoicebil.xsl \ + 
tagger.dtd interchunk.dtd format.dtd transfer.dtd postchunk.dtd modes.dtd \ + tagger.rnc interchunk.rnc format.rnc transfer.rnc postchunk.rnc modes.rnc \ + modes2bash.xsl modes2debugmodes.xsl \ + apertium-createmodes.awk + +apertium_pretransfer_SOURCES = apertium_pretransfer.cc +apertium_multiple_translations_SOURCES = apertium-multiple-translations.cc +apertium_multiple_translations_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) +apertium_destxt_SOURCES = apertium_destxt.cc +apertium_retxt_SOURCES = apertium_retxt.cc +apertium_deshtml_SOURCES = apertium_deshtml.cc +apertium_rehtml_SOURCES = apertium_rehtml.cc +apertium_rehtml_noent_SOURCES = apertium_rehtml_noent.cc +apertium_desxpresstag_SOURCES = apertium_desxpresstag.cc +apertium_rexpresstag_SOURCES = apertium_rexpresstag.cc +apertium_desodt_SOURCES = apertium_desodt.cc +apertium_reodt_SOURCES = apertium_reodt.cc +apertium_desrtf_SOURCES = apertium_desrtf.cc +apertium_rertf_SOURCES = apertium_rertf.cc +apertium_deswxml_SOURCES = apertium_deswxml.cc +apertium_rewxml_SOURCES = apertium_rewxml.cc +apertium_deslatex_SOURCES = apertium_deslatex.cc +apertium_relatex_SOURCES = apertium_relatex.cc +apertium_desxlsx_SOURCES = apertium_desxlsx.cc +apertium_rexlsx_SOURCES = apertium_rexlsx.cc +apertium_despptx_SOURCES = apertium_despptx.cc +apertium_repptx_SOURCES = apertium_repptx.cc +apertium_desmediawiki_SOURCES = apertium_desmediawiki.cc +apertium_remediawiki_SOURCES = apertium_remediawiki.cc +apertium_prelatex_SOURCES = apertium_prelatex.cc +apertium_prelatex_LDADD= $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) +apertium_postlatex_SOURCES = apertium_postlatex.cc +apertium_postlatex_LDADD= $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) +apertium_postlatex_raw_SOURCES = apertium_postlatex_raw.cc +apertium_postlatex_raw_LDADD= $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tagger_SOURCES = apertium_tagger.cc +apertium_tagger_LDADD = $(APERTIUM_LIBS) 
-lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tmxbuild_SOURCES = apertium_tmxbuild.cc +apertium_tmxbuild_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_preprocess_transfer_SOURCES = transferpp.cc +apertium_preprocess_transfer_LDADD = $(APERTIUM_LIBS) \ + -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_filter_ambiguity_SOURCES = apertium_filter_ambiguity.cc +apertium_filter_ambiguity_LDADD = $(APERTIUM_LIBS) \ + -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_transfer_SOURCES = apertium_transfer.cc +apertium_transfer_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_interchunk_SOURCES = apertium_interchunk.cc +apertium_interchunk_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_postchunk_SOURCES = apertium_postchunk.cc +apertium_postchunk_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###apertium_lextor_SOURCES = apertium_lextor.cc +###apertium_lextor_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +#apertium_lextor_eval_SOURCES = apertium-lextor-eval.C +#apertium_lextor_eval_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tagger_apply_new_rules_SOURCES = apertium_tagger_apply_new_rules.cc +apertium_tagger_apply_new_rules_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +apertium_tagger_readwords_SOURCES = apertium_tagger_readwords.cc +apertium_tagger_readwords_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###apertium_lextor_search_SOURCES = apertium-lextor-search.C +###apertium_lextor_search_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###pruebas_lextor_SOURCES = pruebas-lextor.C +###pruebas_lextor_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + +###apertium_gen_wlist_lextor_translation_SOURCES = apertium_gen_wlist_lextor_translation.cc +###apertium_gen_wlist_lextor_translation_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) + + +if WINDOWS +INCLUDES = 
-I$(top_srcdir)/apertium/win32 -I$(top_srcdir) $(APERTIUM_CFLAGS) +else +INCLUDES = -I$(top_srcdir) $(APERTIUM_CFLAGS) +endif +CLEANFILES = *~ apertium_destxt.cc apertium_retxt.cc apertium_deshtml.cc \ + apertium_rehtml.cc apertium_desrtf.cc apertium_rertf.cc \ + apertium_rehtml_noent.cc \ + apertium_deswxml.cc apertium_rewxml.cc \ + apertium_deslatex.cc apertium_relatex.cc \ + apertium_desxlsx.cc apertium_rexlsx.cc \ + apertium_despptx.cc apertium_repptx.cc \ + apertium_desodt.cc apertium_reodt.cc \ + apertium_desxpresstag.cc apertium_rexpresstag.cc \ + apertium_desmediawiki.cc apertium_remediawiki.cc \ + apertium_prelatex.cc apertium_postlatex.cc \ + $(GENERATEDSCRIPTS) + +apertium_destxt.cc: txt-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl txt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desxpresstag.cc: xpresstag-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl xpresstag-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rexpresstag.cc: xpresstag-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl xpresstag-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_retxt.cc: txt-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl txt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_deshtml.cc: html-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl html-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rehtml.cc: html-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl html-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rehtml_noent.cc: html-noent-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl html-noent-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desodt.cc: odt-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl odt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_reodt.cc: odt-format.xml Makefile.am 
reformat.xsl + $(XSLTPROC) reformat.xsl odt-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desrtf.cc: rtf-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl rtf-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rertf.cc: rtf-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl rtf-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_deswxml.cc: wxml-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl wxml-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rewxml.cc: wxml-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl wxml-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_deslatex.cc: latex-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl latex-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_relatex.cc: latex-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl latex-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + + + +apertium_desxlsx.cc: xlsx-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl xlsx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_rexlsx.cc: xlsx-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl xlsx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_despptx.cc: pptx-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl pptx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_repptx.cc: pptx-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl pptx-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_desmediawiki.cc: mediawiki-format.xml Makefile.am deformat.xsl + $(XSLTPROC) deformat.xsl mediawiki-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_remediawiki.cc: mediawiki-format.xml Makefile.am reformat.xsl + $(XSLTPROC) reformat.xsl mediawiki-format.xml >$@tmp + $(FLEX) -Cfer -o$@ $@tmp + rm $@tmp + +apertium_prelatex.cc: 
apertium-prelatex.l + $(FLEX) -Cfer -o$@ apertium-prelatex.l + +apertium_postlatex.cc: apertium-postlatex.l + $(FLEX) -Cfer -o$@ apertium-postlatex.l + +apertium_postlatex_raw.cc: apertium-postlatex-raw.l + $(FLEX) -Cfer -o$@ apertium-postlatex-raw.l + +apertium-validate-tagger: Makefile.am validate-header.sh + @echo "Creating apertium-validate-tagger script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/tagger.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-transfer: Makefile.am validate-header.sh + @echo "Creating apertium-validate-transfer script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/transfer.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-interchunk: Makefile.am validate-header.sh + @echo "Creating apertium-validate-interchunk script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/interchunk.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-postchunk: Makefile.am validate-header.sh + @echo "Creating apertium-validate-postchunk script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/postchunk.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-acx: Makefile.am validate-header.sh + @echo "Creating apertium-validate-acx script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --relaxng \"$(prefix)\"/share/lttoolbox/acx.rng --schema \"$(prefix)\"/share/lttoolbox/acx.xsd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + +apertium-validate-modes: Makefile.am validate-header.sh + @echo "Creating apertium-validate-modes script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/modes.dtd --noout \"\$$FILE1\"" >>$@ + @chmod a+x $@ + + +apertium-validate-dictionary: 
Makefile.am validate-header.sh + @echo "Creating apertium-validate-dictionary script" + @echo "#!$(BASH)" > $@ + @cat validate-header.sh >> $@ + @echo "# xsd is a non-final command, so just treated as a warning when compiling:" >> $@ + @echo "$(XMLLINT) --schema \"$(prefix)\"/share/lttoolbox/dix.xsd --noout \"\$$FILE1\" | grep -vF ' fails to validate'" >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(prefix)\"/share/lttoolbox/dix.dtd --noout \"\$$FILE1\"" >> $@ + @chmod a+x $@ + +apertium-gen-deformat: Makefile.am deformat-header.sh + @echo "Creating apertium-gen-deformat script" + @echo "#!$(BASH)" > $@ + @cat deformat-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \$$FILE1 && \\" >> $@ + @if [ `basename $(XSLTPROC)` == xsltproc ]; \ + then echo "$(XSLTPROC) --stringparam mode \$$MODE \"$(apertiumdir)\"/deformat.xsl \$$FILE1 >/tmp/\$$\$$.deformat.l && \\"; \ + else echo "$(XSLTPROC) \"$(apertiumdir)\"/deformat.xsl \$$FILE1 \"\\\$$mode=\$$MODE\" >/tmp/\$$\$$.deformat.l && \\"; \ + fi >> $@ + @echo "$(FLEX) \$$FLEXOPTS -o/tmp/\$$\$$.lex.cc /tmp/\$$\$$.deformat.l && \\" >> $@ + @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \$$FILE2 /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null && \\" >> $@ + @echo "rm /tmp/\$$\$$.deformat.l /tmp/\$$\$$.lex.cc" >> $@ + @chmod a+x $@ + +apertium-gen-reformat: Makefile.am gen-header.sh + @echo "Creating apertium-gen-reformat script" + @echo "#!$(BASH)" > $@ + @cat gen-header.sh >> $@ + @echo "$(XMLLINT) --dtdvalid \"$(apertiumdir)\"/format.dtd --noout \$$FILE1 && \\" >> $@ + @echo "$(XSLTPROC) \"$(apertiumdir)\"/reformat.xsl \$$FILE1 >/tmp/\$$\$$.reformat.l && \\" >> $@ + @echo "$(FLEX) \$$FLEXOPTS -o/tmp/\$$\$$.lex.cc /tmp/\$$\$$.reformat.l && \\" >> $@ + @echo "$(CXX) -DGENFORMAT $(CXXFLAGS) -w $(APERTIUM_CFLAGS) -I $(apertiuminclude) -o \$$FILE2 /tmp/\$$\$$.lex.cc $(APERTIUM_LIBS) 2>/dev/null &&\\" >> $@ + @echo "rm /tmp/\$$\$$.reformat.l /tmp/\$$\$$.lex.cc" >> $@ 
+ @chmod a+x $@ + +apertium-gen-modes: apertium-gen-modes.in Makefile.am + @echo "#!$(BASH)" > $@ + @echo "APERTIUMDIR=$(apertiumdir)" >> $@ + @echo "XMLLINT=$(XMLLINT)" >> $@ + @echo "XSLTPROC=$(XSLTPROC)" >> $@ + @cat $< >> $@ + @chmod a+x $@ + +apertium-utils-fixlatex: Makefile.am utils-fixlatex-header.sh + @echo "Creating apertium-utils-fixlatex script" + @echo "#!$(BASH)" > $@ + @cat utils-fixlatex-header.sh >> $@ + @chmod a+x $@ + +apertium: Makefile.am apertium-header.sh + @echo "Creating apertium script" + @echo "#!$(BASH)" > $@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "DEFAULT_DIRECTORY=\"$(prefix)/share/apertium\"" >>$@ + @cat apertium-header.sh >>$@ + @chmod a+x $@ + +apertium-unformat: Makefile.am apertium-unformat-header.sh + @echo "Creating apertium-unformat script" + @echo "#!$(BASH)" > $@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "DEFAULT_DIRECTORY=\"$(prefix)/share/apertium\"" >>$@ + @cat apertium-unformat-header.sh >>$@ + @chmod a+x $@ + + +#apertium-translator-lextor: Makefile.am trans-lextor-header.sh +# @echo "Creating apertium-translator-lextor script" +# @echo "#!$(BASH)" > $@ +# @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ +# @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ +# @cat trans-lextor-header.sh >>$@ +# @chmod a+x $@ + +#apertium-gen-oldbil: Makefile.am transformdicbil-header.sh +# @echo "Creating apertium-gen-oldbil script" +# @echo "#!$(BASH)" >$@ +# @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ +# @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ +# @echo "XSLTPROC_OPTIONS=\"\"">>$@ +# @echo "STYLESHEET=\"$(apertiumdir)/new2old.xsl\"">>$@ +# @cat transformdicbil-header.sh >>$@ +# @chmod a+x $@ + +apertium-gen-lextorbil: Makefile.am transformdic-header.sh + @echo "Creating apertium-gen-lextorbil script" + @echo "#!$(BASH)" >$@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" 
>>$@ + @echo "XSLTPROC_OPTIONS_LR=\"\"">>$@ + @echo "XSLTPROC_OPTIONS_RL=\"--stringparam r2l yes\"">>$@ + @echo "STYLESHEET=\"$(apertiumdir)/lexchoicebil.xsl\"">>$@ + @cat transformdic-header.sh >>$@ + @chmod a+x $@ + +apertium-gen-lextormono: Makefile.am transformdic-header.sh + @echo "Creating apertium-gen-lextormono script" + @echo "#!$(BASH)" >$@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "XSLTPROC_OPTIONS_LR=\"\"">>$@ + @echo "XSLTPROC_OPTIONS_RL=\"--stringparam r2l yes\"">>$@ + @echo "STYLESHEET=\"$(apertiumdir)/lexchoice.xsl\"">>$@ + @cat transformdic-header.sh >>$@ + @chmod a+x $@ + +apertium-gen-wlist-lextor: Makefile.am gen-wlist-lextor-header.sh + @echo "Creating apertium-gen-wlist-lextor script" + @echo "#!$(BASH)" >$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @cat gen-wlist-lextor-header.sh >>$@ + @chmod a+x $@ + +apertium-preprocess-corpus-lextor: Makefile.am preprocess-corpus-lextor.sh + @echo "Creating apertium-preprocess-corpus-lextor script" + @echo "#!$(BASH)" >$@ + @echo "LTTOOLBOX_PATH=\"$(prefix)/bin\"" >>$@ + @echo "APERTIUM_PATH=\"$(prefix)/bin\"" >>$@ + @cat preprocess-corpus-lextor.sh >>$@ + @chmod a+x $@ + +apertium-gen-stopwords-lextor: Makefile.am gen-stopwords-lextor.sh + @echo "Creating apertium-gen-stopwords-lextor script" + @echo "#!$(BASH)" >$@ + @cat gen-stopwords-lextor.sh >>$@ + @chmod a+x $@ + +man_MANS=apertium.1 apertium-deshtml.1 apertium-desrtf.1 apertium-destxt.1 \ + apertium-desodt.1 apertium-reodt.1 \ + apertium-deswxml.1 apertium-rewxml.1 \ + apertium-deslatex.1 apertium-relatex.1 \ + apertium-prelatex.1 apertium-postlatex.1 apertium-postlatex-raw.1 \ + apertium-desxlsx.1 apertium-rexlsx.1 \ + apertium-despptx.1 apertium-repptx.1 \ + apertium-desmediawiki.1 apertium-remediawiki.1 \ + apertium-filter-ambiguity.1 apertium-gen-deformat.1 \ + apertium-gen-reformat.1 \ + apertium-preprocess-transfer.1 apertium-pretransfer.1 apertium-rehtml.1 \ + 
apertium-rertf.1 apertium-retxt.1 apertium-tagger.1 apertium-transfer.1 \ + apertium-validate-dictionary.1 apertium-validate-tagger.1 \ + apertium-validate-transfer.1 apertium-gen-modes.1 apertium-interchunk.1 \ + apertium-postchunk.1 apertium-validate-interchunk.1 apertium-utils-fixlatex.1 \ + apertium-validate-postchunk.1 apertium-validate-modes.1 apertium-tagger-apply-new-rules.1 \ + apertium-validate-acx.1 apertium-multiple-translations.1 \ + apertium-unformat.1 +#DEPR.: +# apertium-lextor-eval.1 +# apertium-gen-lextorbil.1 +# apertium-gen-lextormono.1 apertium-gen-stopwords-lextor.1 +# apertium-gen-wlist-lextor.1 apertium-gen-wlist-lextor-translation.1 +# apertium-lextor.1 apertium-preprocess-corpus-lextor.1 + +EXTRA_DIST = gen-header.sh deformat-header.sh \ + reformat.xsl deformat.xsl new2old.xsl lexchoice.xsl lexchoicebil.xsl \ + txt-format.xml \ + html-format.xml odt-format.xml rtf-format.xml wxml-format.xml latex-format.xml\ + html-noent-format.xml \ + xlsx-format.xml pptx-format.xml mediawiki-format.xml trans-header.sh \ + apertium-postlatex.l apertium-postlatex-raw.l apertium-prelatex.l \ + apertium-header.sh apertium-unformat-header.sh $(man_MANS) \ + xpresstag-format.xml \ + validate-header.sh transformdic-header.sh transformdicbil-header.sh \ + tagger.dtd interchunk.dtd format.dtd transfer.dtd postchunk.dtd modes.dtd \ + tagger.rnc interchunk.rnc format.rnc transfer.rnc postchunk.rnc modes.rnc \ + utils-fixlatex-header.sh \ + apertium-gen-modes.in apertium-createmodes.awk modes2bash.xsl modes2debugmodes.xsl +#DEPR.: +# trans-lextor-header.sh +# gen-wlist-lextor-header.sh +# gen-stopwords-lextor.sh +# preprocess-corpus-lextor.sh Index: branches/apertium-tagger/apertium2/apertium/validate-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/validate-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/validate-header.sh (revision 69632) @@ -0,0 +1,12 @@ +if [[ $# 
!= 1 ]]; then + echo "USAGE: $(basename "$0") " + exit 1 +fi + +FILE1=$1 + +if [[ ! -e $FILE1 ]]; then + echo "ERROR: '$1' file not found" + exit 1 +fi + Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.in =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.in (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.in (revision 69632) @@ -0,0 +1,85 @@ +#!/bin/bash +# Makefile.am prepends APERTIUMDIR, XMLLINT, XSLTPROC and the right shebang + +show_help () { + cat <&2 + exit 1 + ;; + esac +done +shift $((OPTIND-1)) + +xmlfile="$1" +if [[ ! -e "${xmlfile}" ]]; then + echo "ERROR: '${xmlfile}' file not found" + exit 1 +fi +xmldir=$(cd "$(dirname "${xmlfile}")"; pwd) + +case $# in + 1) installdir="${xmldir}";; + 2) if ${fullpath}; then + installdir="$2" + else + installdir="${APERTIUMDIR}/$2" + fi + ;; + *) show_help >&2 + exit 1 + ;; +esac + +$verbose && set -x +set -o pipefail # introduced in bash 3; available in OSX>=10.5; should be safe + +[[ -d "${xmldir}"/modes ]] || mkdir "${xmldir}"/modes + +"${XMLLINT}" --dtdvalid "${APERTIUMDIR}"/modes.dtd --noout "${xmlfile}" || exit $? 
+ +"${XSLTPROC}" "${APERTIUMDIR}"/modes2debugmodes.xsl "${xmlfile}" \ + | "${XSLTPROC}" --stringparam devdir "${xmldir}" \ + --stringparam installdir "${installdir}" \ + "${APERTIUMDIR}"/modes2bash.xsl \ + - \ + | awk -f "${APERTIUMDIR}"/apertium-createmodes.awk Index: branches/apertium-tagger/apertium2/apertium/hmm.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/hmm.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/hmm.h (revision 69632) @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +/** + * First order hidden Markov model (HMM) implementation (header) + * + * @author Felipe Sánchez-Martínez - fsanchez@dlsi.ua.es + */ + +#ifndef __HMM_H +#define __HMM_H + +#include "file_tagger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define ZERO 1e-10 + +/** HMM + * first-order hidden Markov Model + */ +class HMM : public Apertium::FILE_Tagger { +private: + TaggerDataHMM tdhmm; + TTag eos; // end-of-sentence tag + + /** It allocs memory for the transition (a) and the emission (b) matrices. + * Before calling this method the number of ambiguity classes must be known. 
+ * This method is called within read_ambiguity_classes and read_dictionary. + * @see: read_ambiguity_classes, read_dictionary + */ + void init(); +public: + void deserialise(FILE *Serialised_FILE_Tagger); + std::vector &getArrayTags(); + void train(FILE *Corpus, unsigned long Count); + void serialise(FILE *Stream_); + void deserialise(const TaggerData &Deserialised_FILE_Tagger); + void init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus); + void init_probabilities_kupiec_(FILE *Corpus); + HMM(); + HMM(TaggerDataHMM *tdhmm); + + /** Constructor + */ + HMM(TaggerDataHMM tdhmm); + + /** Destructor + */ + ~HMM(); + + /** Used to set the end-of-sentence tag + * @param t the end-of-sentence tag + */ + void set_eos(TTag t); + + /** It reads the ambiguity classes from the stream received as + * input + * @param is the input stream + */ + void read_ambiguity_classes(FILE *in); + + /** It writes the ambiguity classes to the stream received as + * a parameter + * @param os the output stream + */ + void write_ambiguity_classes(FILE *out); + + /** It reads the probabilities (matrices a and b) from the stream + * received as a parameter + * @param is the input stream + */ + void read_probabilities(FILE *in); + + /** It writes the probabilities (matrices a and b) to the stream + * received as a parameter + * @param os the output stream + */ + void write_probabilities(FILE *out); + + /** It reads the expanded dictionary received as a parameter and calculates + * the set of ambiguity classes that the tagger will manage. 
+ * @param is the input stream with the expanded dictionary to read + */ + void read_dictionary(FILE *is); + + /** It initializes the transition (a) and emission (b) probabilities + * from an untagged input text by means of Kupiec's method + * @param is the input stream with the untagged corpus to process + */ + void init_probabilities_kupiec (FILE *is); + + /** It initializes the transition (a) and emission (b) probabilities + * from a tagged input text by means of the expected-likelihood + * estimate (ELE) method + * @param ftagged the input stream with the tagged corpus to process + * @param funtagged the same corpus to process but untagged + */ + void init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged); + + /** It applies the forbid and enforce rules found in tagger specification. + * To do so the transition matrix is modified by introducing null probabilities + * in the involved transitions. + */ + void apply_rules(); + + /** Unsupervised training algorithm (Baum-Welch implementation). + * @param is the input stream with the untagged corpus to process + */ + void train (FILE *is); + + /** Tagging algorithm (Viterbi implementation). + * @param in the input stream with the untagged text to tag + * @param out the output stream with the tagged text + */ + void tagger(FILE *Input, FILE *Output, const bool &First = false); + + /** Prints the A matrix. + */ + void print_A(); + + /** Prints the B matrix. + */ + void print_B(); + + /** Prints the ambiguity classes. 
+ */ + void print_ambiguity_classes(); + + void filter_ambiguity_classes(FILE *in, FILE *out); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/lswpost.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/lswpost.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lswpost.h (revision 69632) @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +/** + * Light Sliding-Window Part of Speech Tagger (LSWPoST) implementation (header) + * + * @author Gang Chen - pkuchengang@gmail.com + */ + +#ifndef __LSWPOST_H +#define __LSWPOST_H + +#include "file_tagger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + + +#define ZERO 1e-10 + +/** LSWPoST + * Light Sliding-Window Part of Speech Tagger + */ +class LSWPoST : public Apertium::FILE_Tagger { +private: + TaggerDataLSW tdlsw; + TTag eos; // end-of-sentence tag + +public: + void deserialise(FILE *Serialised_FILE_Tagger); + std::vector &getArrayTags(); + void train(FILE *Corpus, unsigned long Count); + void serialise(FILE *Stream_); + void deserialise(const TaggerData &Deserialised_FILE_Tagger); + void init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *UntaggedCorpus); + void init_probabilities_kupiec_(FILE *Corpus); + LSWPoST(); + LSWPoST(TaggerDataLSW *tdlsw); + + /** Constructor + */ + LSWPoST(TaggerDataLSW t); + + /** Destructor + */ + ~LSWPoST(); + + /** Used to set the end-of-sentence tag + * @param t the end-of-sentence tag + */ + void set_eos(TTag t); + + /** It reads the expanded dictionary received as a parameter and calculates + * the set of ambiguity classes that the tagger will manage. + * @param fdic the input stream with the expanded dictionary to read + */ + void read_dictionary(FILE *fdic); + + /** Whether a tag sequence is valid, according to the forbid and enforce rules + */ + bool is_valid_seq(TTag left, TTag mid, TTag right); + + /** Init probabilities + * It applies the forbid and enforce rules found in tagger specification. + * To do so, the joint probability of a tag sequence that contains a forbid + * rule, or doesn't satisfy a enforce rule, is set to 0. + */ + void init_probabilities(FILE *ftxt); + + /** Unsupervised training algorithm (Baum-Welch implementation). 
+ * @param ftxt the input stream with the untagged corpus to process + */ + void train (FILE *ftxt); + + /** Prints the para matrix. + */ + void print_para_matrix(); + + /** Do the tagging + */ + void tagger(FILE *Input, FILE *Output, const bool &First = false); +}; +#endif Index: branches/apertium-tagger/apertium2/apertium/file_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/file_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/file_tagger.h (revision 69632) @@ -0,0 +1,52 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef FILE_TAGGER_H +#define FILE_TAGGER_H + +#include + +#include +#include +#include + +namespace Apertium { +class FILE_Tagger { +public: + FILE_Tagger(); + virtual ~FILE_Tagger(); + virtual void deserialise(FILE *Serialised_FILE_Tagger) = 0; + void set_debug(const bool &Debug); + void set_show_sf(const bool &ShowSuperficial); + void setNullFlush(const bool &NullFlush); + virtual void tagger(FILE *Input, FILE *Output, const bool &First = false) = 0; + virtual std::vector &getArrayTags() = 0; + virtual void train(FILE *Corpus, unsigned long Count) = 0; + virtual void serialise(FILE *Stream_) = 0; + void deserialise(char *const TaggerSpecificationFilename); + virtual void read_dictionary(FILE *Dictionary) = 0; + virtual void init_probabilities_from_tagged_text_(FILE *TaggedCorpus, + FILE *Corpus) = 0; + virtual void init_probabilities_kupiec_(FILE *Corpus) = 0; + +protected: + virtual void deserialise(const TaggerData &Deserialised_FILE_Tagger) = 0; + bool debug; + bool show_sf; + bool null_flush; +}; +} + +#endif // FILE_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/format.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/format.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/format.rng (revision 69632) @@ -0,0 +1,303 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + no + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + comment + empty + open + close + + + + + + + + + yes + no + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + no + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/interchunk.rng 
=================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.rng (revision 69632) @@ -0,0 +1,971 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/postchunk.rng =================================================================== --- 
branches/apertium-tagger/apertium2/apertium/postchunk.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.rng (revision 69632) @@ -0,0 +1,971 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/tagger.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger.rng (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tagger.rng (revision 69632) @@ -0,0 +1,310 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/format.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/format.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/format.rnc (revision 69632) @@ -0,0 +1,111 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# DTD for the format specification files +# Sergio Ortiz 2005.05.13 + +format = element format { attlist.format, options, rules } +attlist.format &= attribute name { text } +# 'format' is the root element containing the whole format specification +# file. 
The attribute 'name' specifies the name of the format +options = + element options { + attlist.options, + largeblocks, + input, + output, + tag-name, + escape-chars, + space-chars, + case-sensitive + } +attlist.options &= empty +# General options of the format +largeblocks = element largeblocks { attlist.largeblocks, empty } +attlist.largeblocks &= attribute size { text } +# The attribute size is used to define the maximal size in bytes of +# inline format blocks +input = element input { attlist.input, empty } +attlist.input &= attribute zip-path { text }? +attlist.input &= attribute encoding { text } +# Reserved for future extensions +output = element output { attlist.output, empty } +attlist.output &= attribute zip-path { text }? +attlist.output &= attribute encoding { text } +# Reserved for future extensions +tag-name = element tag-name { attlist.tag-name, empty } +attlist.tag-name &= attribute regexp { text } +# The attribute regexp defines (with a _flex_ regular expression) how to +# take a tag name from a whole tag. '\' +escape-chars = element escape-chars { attlist.escape-chars, empty } +attlist.escape-chars &= attribute regexp { text } +# The attribute regexp defines (with a _flex_ regular expression) the +# set of characters to be escaped with a preceding backslash '\' +space-chars = element space-chars { attlist.space-chars, empty } +attlist.space-chars &= attribute regexp { text } +# Define the space characters (in regexp) with a _flex_ regular +# expression +case-sensitive = + element case-sensitive { attlist.case-sensitive, empty } +attlist.case-sensitive &= attribute value { "yes" | "no" } +# The attribute 'value' is set to 'yes' if the case is relevant in the +# specification of the format. 
Otherwise it is set to 'no' +rules = + element rules { attlist.rules, (format-rule | replacement-rule)+ } +attlist.rules &= empty +# Group the rules of processing format and the rules of substitute +# expressions by characters that are part of the text +format-rule = + element format-rule { + attlist.format-rule, + (tag | (begin, end)) + } +attlist.format-rule &= + attribute type { "comment" | "empty" | "open" | "close" }? +attlist.format-rule &= attribute eos { "yes" | "no" }? +attlist.format-rule &= attribute priority { text } +# Format rule parent element. It may include a 'tag' element or +# a couple of elements 'begin', 'end'. In the first case, this element is +# considered to be part of the format. In the second case, the begin and +# the end element are considered to enclose format. The attribute +# 'eos' (end of sentence) is set to 'yes' if that rule defines a dot in +# the text being processed (is no by default). The attribute 'priority' +# marks the order of precedence of the rule +tag = element tag { attlist.tag, empty } +attlist.tag &= attribute regexp { text } +# Define an element that is part of the format by the pattern specified +# as a value for the regexp attribute +begin = element begin { attlist.begin, empty } +attlist.begin &= attribute regexp { text } +# The attribute 'regexp' is the regular expression that detects the +# beginning delimiter of a block of format +end = element end { attlist.end, empty } +attlist.end &= attribute regexp { text } +# The attribute 'regexp' is the regular expression that detects the +# ending delimiter of a block of format +replacement-rule = + element replacement-rule { attlist.replacement-rule, replace+ } +attlist.replacement-rule &= attribute regexp { text } +# Root element for a replacement rule. 
The attribute 'regexp' is the +# general expression to detect the elements to replace +replace = element replace { attlist.replace, empty } +attlist.replace &= attribute source { text } +attlist.replace &= attribute target { text } +attlist.replace &= attribute prefer { "yes" | "no" }? +start = format +# Replacement rule. The 'source' is a string of one or more characters. +# The 'target' MUST be a single character. The 'prefer' attribute, when +# set to 'yes' defines the preferred reverse translation of the +# replacement. Index: branches/apertium-tagger/apertium2/apertium/interchunk.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.rnc (revision 69632) @@ -0,0 +1,353 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# Draft of DTD for the structural transfer rule files +# +# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada, +# 2005.07.29. 
+ +condition = + and + | or + | not + | equal + | begins-with + | begins-with-list + | ends-with + | ends-with-list + | contains-substring + | in +container = var | clip +sentence = let | out | choose | modify-case | call-macro | append +value = + b + | clip + | lit + | lit-tag + | var + | get-case-from + | case-of + | concat + | chunk +stringvalue = clip | lit | var | get-case-from | case-of +interchunk = + element interchunk { + attlist.interchunk, + section-def-cats, + section-def-attrs, + section-def-vars, + section-def-lists?, + section-def-macros?, + section-rules + } +attlist.interchunk &= empty +# 'interchunk' is the root element containing the whole structural +# interchunk rule file. +section-def-cats = + element section-def-cats { attlist.section-def-cats, def-cat+ } +attlist.section-def-cats &= empty +# The 'def-cats' section defines the categories used to build the +# patterns used in rules +def-cat = element def-cat { attlist.def-cat, cat-item+ } +attlist.def-cat &= + attribute n { xsd:ID }, + attribute c { text }? +# Each 'def-cat' defines one category in terms of a list of +# category items and has a unique name 'n', which is mandatory +cat-item = element cat-item { attlist.cat-item, empty } +attlist.cat-item &= + attribute lemma { text }?, + attribute tags { text }, + attribute c { text }? +# Each 'cat-item' (category item) represents a set of lexical forms +# and has a mandatory attribute 'tags' whose value is a sequence of +# dot-separated tag names; this sequence is a subsequence of the +# tag sequence defining each possible lexical form. For example, +# tags="n.f" would match all lexical forms containing this tag +# sequence, such as "^casa$". 
+# +# In addition, an optional attribute, "lemma", may be used to +# define lexical forms having a particular substring in their lemma +section-def-attrs = + element section-def-attrs { attlist.section-def-attrs, def-attr+ } +attlist.section-def-attrs &= empty +# The 'def-attrs' section defines the attributes that will be +# identified in matched lexical forms +def-attr = element def-attr { attlist.def-attr, attr-item+ } +attlist.def-attr &= + attribute n { xsd:ID }, + attribute c { text }? +# Each def-attr defines one attribute in terms of a list of +# attribute items and has a mandatory unique name n +attr-item = element attr-item { attlist.attr-item, empty } +attlist.attr-item &= + attribute tags { text }?, + attribute c { text }? +# Each 'attr-item' specifies a subsequence of the tags in +# that lexical form (attribute 'tags') +section-def-vars = + element section-def-vars { attlist.section-def-vars, def-var+ } +attlist.section-def-vars &= empty +# The 'def-vars' section defines the global variables +# that will be used to transfer information between rules +def-var = element def-var { attlist.def-var, empty } +attlist.def-var &= + attribute n { xsd:ID }, + attribute v { text }?, + attribute c { text }? +# The definition of a global variable has a mandatory unique name 'n' that +# will be used to refer to it. A value of initialization can also be specified +# by means the 'v' attribute. The default value of the initialization is the +# empty string. +section-def-lists = + element section-def-lists { attlist.section-def-lists, def-list+ } +attlist.section-def-lists &= empty +# Element 'section-def-lists' encloses a set of list definitions +def-list = element def-list { attlist.def-list, list-item+ } +attlist.def-list &= + attribute n { xsd:ID }, + attribute c { text }? +# The 'def-list' element defines a named list to search with the 'in' +# element. 
Attribute 'n' sets the name of the list +list-item = element list-item { attlist.list-item, empty } +attlist.list-item &= + attribute v { text }, + attribute c { text }? +# Attribute 'v' of 'list-item' element contains the value to be added to +# the list being defined +section-def-macros = + element section-def-macros { attlist.section-def-macros, def-macro+ } +attlist.section-def-macros &= empty +# +# The 'def-macros' section defines macros containing portions of +# code frequently used in the action part of rules +# +def-macro = element def-macro { attlist.def-macro, sentence+ } +attlist.def-macro &= attribute n { xsd:ID } +attlist.def-macro &= + attribute npar { text }, + attribute c { text }? +# Macro definition: +# +# A macro has a mandatory name (the value of 'n'), a number of parameters +# (the value of 'npar') and a body containing arguments and statements. +section-rules = element section-rules { attlist.section-rules, rule+ } +attlist.section-rules &= empty +# The rules section contains a sequence of one or more rules +rule = element rule { attlist.rule, pattern, action } +attlist.rule &= attribute comment { text }? +# Each rule has a pattern and an action +# * attribute 'comment' allows to put in comments about the purpose of +# the rule being defined +pattern = element pattern { attlist.pattern, pattern-item+ } +attlist.pattern &= empty +# The pattern is specified in terms of pattern items, each one +# representing a lexical form in the matched pattern +pattern-item = element pattern-item { attlist.pattern-item, empty } +attlist.pattern-item &= attribute n { xsd:IDREF } +# Each attribute to be activated is referred to by its name in the def-cats section +action = element action { attlist.action, sentence* } +attlist.action &= attribute c { text }? +# Encloses the procedural part of a rule +choose = element choose { attlist.choose, when+, otherwise? } +attlist.choose &= attribute c { text }? 
+# The choose statement is a selection statement (similar to a case +# statement) composed of one or more tested cases and an optional +# otherwise +when = element when { attlist.when, test, sentence* } +attlist.when &= attribute c { text }? +# Each tested case is a block of zero or more statements +otherwise = element otherwise { attlist.otherwise, sentence+ } +attlist.otherwise &= attribute c { text }? +# The otherwise case is also a block of one or more statements +test = element test { attlist.test, condition } +attlist.test &= attribute c { text }? +# The test in a tested case may be a conjunction, a disjunction, or +# a negation of simpler tests, as well as a simple equality test +and = element and { attlist.and, condition, condition+ } +attlist.and &= empty +# Each conjunction test contains two or more simpler tests +or = element or { attlist.or, condition, condition+ } +attlist.or &= empty +# Each disjunction test contains two or more simpler tests +not = element not { attlist.not, condition } +attlist.not &= empty +# The negation of a simpler test is a test itself +equal = element equal { attlist.equal, value, value } +attlist.equal &= attribute caseless { "no" | "yes" }? +# The simplest test is an equality test. The right part and the +# left part of the equality may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with = element begins-with { attlist.begins-with, value, value } +attlist.begins-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. 
When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with = element ends-with { attlist.ends-with, value, value } +attlist.ends-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with-list = + element begins-with-list { attlist.begins-with-list, value, \list } +attlist.begins-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with-list = + element ends-with-list { attlist.ends-with-list, value, \list } +attlist.ends-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +contains-substring = + element contains-substring { + attlist.contains-substring, value, value + } +attlist.contains-substring &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part. 
+# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +in = element in { attlist.in, value, \list } +attlist.in &= attribute caseless { "no" | "yes" }? +# 'in' performs a search of a value in a list. If 'caseless' is set to yes, +# this search is performed without attending to the case +\list = element list { attlist.list, empty } +attlist.list &= attribute n { xsd:IDREF } +# 'list' refers, with the name in attribute 'n', a list defined before in +# the 'section-def-list' section +let = element let { attlist.let, container, value } +attlist.let &= empty +# An assignment statement ('let') assigns the value of a clip (see +# below), a literal string ('lit'), a literal tag('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip +append = element append { attlist.append, value+ } +attlist.append &= attribute n { xsd:IDREF } +# This instruction appends the value of a clip (see +# below), a literal string ('lit'), a literal tag('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip, identified by the "n" attribute +out = element out { attlist.out, (b | chunk | var)+ } +attlist.out &= attribute c { text }? +# 'out' is an output statement; it may output blanks or chunks +modify-case = + element modify-case { attlist.modify-case, container, stringvalue } +attlist.modify-case &= empty +# The first argument of 'modify-case' copy the case of the second +# argument. 
+call-macro = element call-macro { attlist.call-macro, with-param* } +attlist.call-macro &= attribute n { xsd:IDREF } +# A macro may be called anywhere by name with one or more +# arguments +with-param = element with-param { attlist.with-param, empty } +attlist.with-param &= attribute pos { text } +# The attribute pos in each argument is used to refer to a lexical +# form in the current rule. For example, if a 2-parameter macro +# has been defined to perform noun-adjective agreement operations, +# it may be used with arguments 1 and 2 in a noun-adjective rule, +# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with +# arguments 1 and 3 in a noun-adverb-adjective rule, and with +# arguments 2 and 1 in an adjective-noun rule +clip = element clip { attlist.clip, empty } +attlist.clip &= + attribute pos { text }, + attribute part { text }, + attribute c { text }? +# A 'clip' is a substring of a source-language or target-language +# lexical form, extracted according to an attribute: +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +# +lit = element lit { attlist.lit, empty } +attlist.lit &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +lit-tag = element lit-tag { attlist.lit-tag, empty } +attlist.lit-tag &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +var = element var { attlist.var, empty } +attlist.var &= attribute n { xsd:IDREF } +# Each 'var' is a variable identifier: the attribute n is the name +# of the variable. 
When it is in an 'out', a 'test', or the right +# part of a 'let', it represents the value of the variable; when in +# the left part of a 'let' it represents the reference of the +# variable. +get-case-from = + element get-case-from { attlist.get-case-from, (clip | lit | var) } +attlist.get-case-from &= attribute pos { text } +# Attention: all the comments that involve +# get-case-from still need to be updated +case-of = element case-of { attlist.case-of, empty } +attlist.case-of &= + attribute pos { text }, + attribute part { text } +# A 'case-of' is a value representing the case of a "clip". This value +# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA" +# (all uppercase). +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +concat = element concat { attlist.concat, value+ } +attlist.concat &= empty +# Concatenates a sequence of values +chunk = element chunk { attlist.chunk, value+ } +attlist.chunk &= empty +# Encloses a chunk +pseudolemma = element pseudolemma { attlist.pseudolemma, value } +attlist.pseudolemma &= empty +b = element b { attlist.b, empty } +attlist.b &= attribute pos { text }? +start = interchunk | pseudolemma +# 'b' is a [super]blanks item, indexed by pos; for example, a 'b' +# with pos="2" refers to the [super]blanks (including format data +# encapsulated by the de-formatter) between lexical form 2 and +# lexical form 3. Managing [super]blanks explicitly allows for the +# correct placement of format when the result of structural +# transfer has more or less lexical items than the original or has +# been reordered in some way. If attribute "pos" is not specified, then +# a single blank (ASCII 32) is generated.
Index: branches/apertium-tagger/apertium2/apertium/postchunk.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/postchunk.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.rnc (revision 69632) @@ -0,0 +1,348 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. +# +# Draft of DTD for the structural transfer rule files +# +# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada, +# 2005.07.29. + +condition = + and + | or + | not + | equal + | begins-with + | begins-with-list + | ends-with + | ends-with-list + | contains-substring + | in +container = var | clip +sentence = let | out | choose | modify-case | call-macro | append +value = + b + | clip + | lit + | lit-tag + | var + | get-case-from + | case-of + | concat + | lu-count + | lu + | mlu +stringvalue = clip | lit | var | get-case-from | case-of | lu-count +postchunk = + element postchunk { + attlist.postchunk, + section-def-cats, + section-def-attrs, + section-def-vars, + section-def-lists?, + section-def-macros?, + section-rules + } +attlist.postchunk &= empty +# 'postchunk' is the root element containing the whole structural +# postchunk rule file.
+section-def-cats = + element section-def-cats { attlist.section-def-cats, def-cat+ } +attlist.section-def-cats &= empty +# The 'def-cats' section defines the categories used to build the +# patterns used in rules +def-cat = element def-cat { attlist.def-cat, cat-item+ } +attlist.def-cat &= + attribute n { xsd:ID }, + attribute c { text }? +# Each 'def-cat' defines one category in terms of a list of +# category items and has a unique name 'n', which is mandatory +cat-item = element cat-item { attlist.cat-item, empty } +attlist.cat-item &= attribute name { text } +# A required attribute, "name", is used to specify +# which chunk name is detected by this cat-item +section-def-attrs = + element section-def-attrs { attlist.section-def-attrs, def-attr+ } +attlist.section-def-attrs &= empty +# The 'def-attrs' section defines the attributes that will be +# identified in matched lexical forms +def-attr = element def-attr { attlist.def-attr, attr-item+ } +attlist.def-attr &= + attribute n { xsd:ID }, + attribute c { text }? +# Each def-attr defines one attribute in terms of a list of +# attribute items and has a mandatory unique name n +attr-item = element attr-item { attlist.attr-item, empty } +attlist.attr-item &= + attribute tags { text }?, + attribute c { text }? +# Each 'attr-item' specifies a subsequence of the tags in +# that lexical form (attribute 'tags') +section-def-vars = + element section-def-vars { attlist.section-def-vars, def-var+ } +attlist.section-def-vars &= empty +# The 'def-vars' section defines the global variables +# that will be used to transfer information between rules +def-var = element def-var { attlist.def-var, empty } +attlist.def-var &= + attribute n { xsd:ID }, + attribute v { text }?, + attribute c { text }? +# The definition of a global variable has a mandatory unique name 'n' that +# will be used to refer to it. An initial value can also be specified +# by means of the 'v' attribute.
The default value of the initialization is the +# empty string. +section-def-lists = + element section-def-lists { attlist.section-def-lists, def-list+ } +attlist.section-def-lists &= empty +# Element 'section-def-lists' encloses a set of list definitions +def-list = element def-list { attlist.def-list, list-item+ } +attlist.def-list &= + attribute n { xsd:ID }, + attribute c { text }? +# The 'def-list' element defines a named list to search with the 'in' +# element. Attribute 'n' sets the name of the list +list-item = element list-item { attlist.list-item, empty } +attlist.list-item &= + attribute v { text }, + attribute c { text }? +# Attribute 'v' of 'list-item' element contains the value to be added to +# the list being defined +section-def-macros = + element section-def-macros { attlist.section-def-macros, def-macro+ } +attlist.section-def-macros &= empty +# +# The 'def-macros' section defines macros containing portions of +# code frequently used in the action part of rules +# +def-macro = element def-macro { attlist.def-macro, sentence+ } +attlist.def-macro &= attribute n { xsd:ID } +attlist.def-macro &= + attribute npar { text }, + attribute c { text }? +# Macro definition: +# +# A macro has a mandatory name (the value of 'n'), a number of parameters +# (the value of 'npar') and a body containing arguments and statements. +section-rules = element section-rules { attlist.section-rules, rule+ } +attlist.section-rules &= empty +# The rules section contains a sequence of one or more rules +rule = element rule { attlist.rule, pattern, action } +attlist.rule &= attribute comment { text }? 
+# Each rule has a pattern and an action +# * Attribute 'comment' allows a comment to be included with the rule +pattern = element pattern { attlist.pattern, pattern-item } +attlist.pattern &= empty +# The pattern is specified in terms of pattern items, each one +# representing a lexical form in the matched pattern +pattern-item = element pattern-item { attlist.pattern-item, empty } +attlist.pattern-item &= attribute n { xsd:IDREF } +# Each attribute to be activated is referred to by its name in the def-cats section +action = element action { attlist.action, sentence* } +attlist.action &= attribute c { text }? +# Encloses the procedural part of a rule +choose = element choose { attlist.choose, when+, otherwise? } +attlist.choose &= attribute c { text }? +# The choose statement is a selection statement (similar to a case +# statement) composed of one or more tested cases and an optional +# otherwise +when = element when { attlist.when, test, sentence* } +attlist.when &= attribute c { text }? +# Each tested case is a block of zero or more statements +otherwise = element otherwise { attlist.otherwise, sentence+ } +attlist.otherwise &= attribute c { text }? +# The otherwise case is also a block of one or more statements +test = element test { attlist.test, condition } +attlist.test &= attribute c { text }? +# The test in a tested case may be a conjunction, a disjunction, or +# a negation of simpler tests, as well as a simple equality test +and = element and { attlist.and, condition, condition+ } +attlist.and &= empty +# Each conjunction test contains two or more simpler tests +or = element or { attlist.or, condition, condition+ } +attlist.or &= empty +# Each disjunction test contains two or more simpler tests +not = element not { attlist.not, condition } +attlist.not &= empty +# The negation of a simpler test is a test itself +equal = element equal { attlist.equal, value, value } +attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the +# left part of the equality may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with = element begins-with { attlist.begins-with, value, value } +attlist.begins-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with = element ends-with { attlist.ends-with, value, value } +attlist.ends-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with-list = + element begins-with-list { attlist.begins-with-list, value, \list } +attlist.begins-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. 
+ends-with-list = + element ends-with-list { attlist.ends-with-list, value, \list } +attlist.ends-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +contains-substring = + element contains-substring { + attlist.contains-substring, value, value + } +attlist.contains-substring &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +in = element in { attlist.in, value, \list } +attlist.in &= attribute caseless { "no" | "yes" }? +# 'in' performs a search of a value in a list. 
If 'caseless' is set to yes, +# this search is performed without attending to the case +\list = element list { attlist.list, empty } +attlist.list &= attribute n { xsd:IDREF } +# 'list' refers, with the name in attribute 'n', a list defined before in +# the 'section-def-list' section +let = element let { attlist.let, container, value } +attlist.let &= empty +# An assignment statement ('let') assigns the value of a clip (see +# below), a literal string ('lit'), a literal tag('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip +append = element append { attlist.append, value+ } +attlist.append &= attribute n { xsd:IDREF } +# This instruction appends the value of a clip (see +# below), a literal string ('lit'), a literal tag('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip, identified by the "n" attribute +out = element out { attlist.out, (b | lu | mlu | var)+ } +attlist.out &= attribute c { text }? +# 'out' is an output statement; it may output blanks or chunks +modify-case = + element modify-case { attlist.modify-case, container, stringvalue } +attlist.modify-case &= empty +# The first argument of 'modify-case' copy the case of the second +# argument. +call-macro = element call-macro { attlist.call-macro, with-param* } +attlist.call-macro &= attribute n { xsd:IDREF } +# A macro may be called anywhere by name with one or more +# arguments +with-param = element with-param { attlist.with-param, empty } +attlist.with-param &= attribute pos { text } +# The attribute pos in each argument is used to refer to a lexical +# form in the current rule. 
For example, if a 2-parameter macro +# has been defined to perform noun-adjective agreement operations, +# it may be used with arguments 1 and 2 in a noun-adjective rule, +# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with +# arguments 1 and 3 in a noun-adverb-adjective rule, and with +# arguments 2 and 1 in an adjective-noun rule +clip = element clip { attlist.clip, empty } +attlist.clip &= + attribute pos { text }, + attribute part { text }, + attribute c { text }? +# A 'clip' is a substring of a source-language or target-language +# lexical form, extracted according to an attribute: +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +# +lit = element lit { attlist.lit, empty } +attlist.lit &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +lit-tag = element lit-tag { attlist.lit-tag, empty } +attlist.lit-tag &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +var = element var { attlist.var, empty } +attlist.var &= attribute n { xsd:IDREF } +# Each 'var' is a variable identifier: the attribute n is the name +# of the variable. When it is in an 'out', a 'test', or the right +# part of a 'let', it represents the value of the variable; when in +# the left part of a 'let' it represents the reference of the +# variable. 
+get-case-from = + element get-case-from { attlist.get-case-from, (clip | lit | var) } +attlist.get-case-from &= attribute pos { text } +# Attention: all the comments that involve +# get-case-from still need to be updated +case-of = element case-of { attlist.case-of, empty } +attlist.case-of &= + attribute pos { text }, + attribute part { text } +# A 'case-of' is a value representing the case of a "clip". This value +# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA" +# (all uppercase). +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may take also the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +concat = element concat { attlist.concat, value+ } +attlist.concat &= empty +# Concatenates a sequence of values +mlu = element mlu { attlist.mlu, lu+ } +attlist.mlu &= empty +# Encloses a multiword +lu = element lu { attlist.lu, value+ } +attlist.lu &= empty +# Encloses a word +b = element b { attlist.b, empty } +attlist.b &= attribute pos { text }? +# 'b' is a [super]blanks item, indexed by pos; for example, a 'b' +# with pos="2" refers to the [super]blanks (including format data +# encapsulated by the de-formatter) between lexical form 2 and +# lexical form 3. Managing [super]blanks explicitly allows for the +# correct placement of format when the result of structural +# transfer has more or less lexical items than the original or has +# been reordered in some way. If attribute "pos" is not specified, then +# a single blank (ASCII 32) is generated.
+lu-count = element lu-count { attlist.lu-count, empty } +attlist.lu-count &= empty +start = postchunk +# Number of lexical units (words inside the chunk) in the rule Index: branches/apertium-tagger/apertium2/apertium/tagger.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger.rnc (revision 69632) @@ -0,0 +1,122 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# DTD for the tagset and the rules to enforce the state to state +# transition probabilities used by the part-of-speech tagger. +# 2005.07.29. + +tagger = + element tagger { + attlist.tagger, + tagset, + forbid?, + enforce-rules?, + preferences?, + discard-on-ambiguity? 
+ } +attlist.tagger &= attribute name { text } +# 'tagger' is the root element containing the whole tagset for a given +# language specified through the mandatory attribute 'name' +tagset = element tagset { attlist.tagset, def-label+, def-mult* } +attlist.tagset &= empty +# The 'tagset' section defines the correspondence between simple +# or multiple morphological categories defining a lexical form and the coarser +# ones with which the part-of-speech tagger works +def-label = element def-label { attlist.def-label, tags-item+ } +attlist.def-label &= + attribute name { text }, + attribute c { text }?, + attribute closed { text }? +# Each 'def-label' defines one coarse tag in terms of a list of fine tags +# and has a mandatory unique name. The optional attribute 'closed="true"' may be used +# to specify if the defined fine tags belong to a closed list. +# c is for comments and is ignored +tags-item = element tags-item { attlist.tags-item, empty } +attlist.tags-item &= + attribute tags { text }, + attribute lemma { text }? +# Each 'tags-item' may be a dot-separated subsequence of the morphological tags +# corresponding to a coarse tag optionally in association with a given lemma +def-mult = element def-mult { attlist.def-mult, sequence+ } +attlist.def-mult &= + attribute name { text }, + attribute c { text }?, + attribute closed { text }? +# Each 'def-mult' defines one coarse tag in terms of a sequence of coarse +# tags previously defined as 'def-labels' or a sequence of fine tags.
A mandatory +# name is required for each 'def-mult' which may also have an optional attribute +# 'closed="true"' if it belongs to a closed list +# c is for comments and is ignored +sequence = + element sequence { attlist.sequence, (tags-item | label-item)+ } +attlist.sequence &= empty +# Element 'sequence' encloses a set of tags or labels which defines +# a unit with more than one label +label-item = element label-item { attlist.label-item, empty } +attlist.label-item &= + attribute label { text }, + attribute c { text }? +# Each 'label' of the 'label-item' corresponds to a coarse tag previously +# defined as a 'def-label' by a name. +# c is for comments and is ignored +forbid = element forbid { attlist.forbid, label-sequence+ } +attlist.forbid &= empty +# Element 'forbid' contains sequences of morphological categories that are not +# allowed in a given language +label-sequence = + element label-sequence { attlist.label-sequence, label-item+ } +attlist.label-sequence &= attribute c { text }? +# Each 'label-sequence' is restricted to two 'label-items' (this schema only checks for one or more) +# c is for comments and is ignored +enforce-rules = + element enforce-rules { attlist.enforce-rules, enforce-after+ } +attlist.enforce-rules &= empty +# Element 'enforce-rules' defines sets of coarse tags that must follow specified ones +enforce-after = + element enforce-after { attlist.enforce-after, label-set } +attlist.enforce-after &= + attribute label { text }, + attribute c { text }? +# Each 'enforce-after' encloses the set of coarse tags ('label-set') that must follow +# the one defined in 'label', as a mandatory attribute +# c is for comments and is ignored +label-set = element label-set { attlist.label-set, label-item+ } +attlist.label-set &= attribute c { text }?
+# The set of 'label-items' enforced after a 'label' are enclosed inside element 'label-set' +# c is for comments and is ignored +preferences = element preferences { attlist.preferences, prefer+ } +attlist.preferences &= empty +# Element 'preferences' allows to decide amongst two or more fine tag sequences +# which are grouped in the same coarse tag. +prefer = element prefer { attlist.prefer, empty } +attlist.prefer &= + attribute tags { text }, + attribute c { text }? +# Each 'prefer' element has a mandatory attribute 'tags' made of a sequence of fine tags +# c is for comments and is ignored +discard-on-ambiguity = + element discard-on-ambiguity { + attlist.discard-on-ambiguity, discard+ + } +attlist.discard-on-ambiguity &= empty +# List of label-item or tags-item to be discarded when an ambiguity +# occurs inside a word +discard = element discard { attlist.discard, empty } +attlist.discard &= + attribute tags { text }, + attribute c { text }? +start = tagger +# Each 'discard' element has a mandatory attribute 'tags' made of a sequence of fine tags +# c is for comments and is ignored Index: branches/apertium-tagger/apertium2/apertium/transfer.rnc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.rnc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.rnc (revision 69632) @@ -0,0 +1,407 @@ +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. +# +# Draft of DTD for the structural transfer rule files +# +# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada, +# 2005.07.29. + +condition = + and + | or + | not + | equal + | begins-with + | begins-with-list + | ends-with + | ends-with-list + | contains-substring + | in +container = var | clip +sentence = + let + | out + | choose + | modify-case + | call-macro + | append + | reject-current-rule +value = + b + | clip + | lit + | lit-tag + | var + | get-case-from + | case-of + | concat + | lu + | mlu + | chunk +stringvalue = clip | lit | var | get-case-from | case-of +transfer = + element transfer { + attlist.transfer, + section-def-cats, + section-def-attrs?, + section-def-vars?, + section-def-lists?, + section-def-macros?, + section-rules + } +attlist.transfer &= attribute default { "lu" | "chunk" }? +# 'transfer' is the root element containing the whole structural +# transfer rule file. Attribute 'default' specifies if +# unmatched words have to be written as lexical units ("lu", this is +# the default value) or as chunks ("chunk"). +section-def-cats = + element section-def-cats { attlist.section-def-cats, def-cat+ } +attlist.section-def-cats &= empty +# The 'def-cats' section defines the categories used to build the +# patterns used in rules +def-cat = element def-cat { attlist.def-cat, cat-item+ } +attlist.def-cat &= + attribute n { xsd:ID }, + attribute c { text }? +# Each 'def-cat' defines one category in terms of a list of +# category items and has a unique name 'n', which is mandatory +cat-item = element cat-item { attlist.cat-item, empty } +attlist.cat-item &= + attribute lemma { text }?, + attribute tags { text }, + attribute c { text }?
+# Each 'cat-item' (category item) represents a set of lexical forms +# and has a mandatory attribute 'tags' whose value is a sequence of +# dot-separated tag names; this sequence is a subsequence of the +# tag sequence defining each possible lexical form. For example, +# tags="n.f" would match all lexical forms containing this tag +# sequence, such as "^casa$". +# +# In addition, an optional attribute, "lemma", may be used to +# define lexical forms having a particular substring in their lemma +section-def-attrs = + element section-def-attrs { attlist.section-def-attrs, def-attr+ } +attlist.section-def-attrs &= empty +# The 'def-attrs' section defines the attributes that will be +# identified in matched lexical forms +def-attr = element def-attr { attlist.def-attr, attr-item+ } +attlist.def-attr &= + attribute n { xsd:ID }, + attribute c { text }? +# Each def-attr defines one attribute in terms of a list of +# attribute items and has a mandatory unique name n +attr-item = element attr-item { attlist.attr-item, empty } +attlist.attr-item &= + attribute tags { text }?, + attribute c { text }? +# Each 'attr-item' specifies a subsequence of the tags in +# that lexical form (attribute 'tags') +section-def-vars = + element section-def-vars { attlist.section-def-vars, def-var+ } +attlist.section-def-vars &= empty +# The 'def-vars' section defines the global variables +# that will be used to transfer information between rules +def-var = element def-var { attlist.def-var, empty } +attlist.def-var &= + attribute n { xsd:ID }, + attribute v { text }?, + attribute c { text }? +# The definition of a global variable has a mandatory unique name 'n' that +# will be used to refer to it. A value of initialization can also be specified +# by means the 'v' attribute. The default value of the initialization is the +# empty string. 
+section-def-lists = + element section-def-lists { attlist.section-def-lists, def-list+ } +attlist.section-def-lists &= empty +# Element 'section-def-lists' encloses a set of list definitions +def-list = element def-list { attlist.def-list, list-item+ } +attlist.def-list &= + attribute n { xsd:ID }, + attribute c { text }? +# The 'def-list' element defines a named list to search with the 'in' +# element. Attribute 'n' sets the name of the list +list-item = element list-item { attlist.list-item, empty } +attlist.list-item &= + attribute v { text }, + attribute c { text }? +# Attribute 'v' of 'list-item' element contains the value to be added to +# the list being defined +section-def-macros = + element section-def-macros { attlist.section-def-macros, def-macro+ } +attlist.section-def-macros &= empty +# +# The 'def-macros' section defines macros containing portions of +# code frequently used in the action part of rules +# +def-macro = element def-macro { attlist.def-macro, sentence+ } +attlist.def-macro &= attribute n { xsd:ID } +attlist.def-macro &= + attribute npar { text }, + attribute c { text }? +# Macro definition: +# +# A macro has a mandatory name (the value of 'n'), a number of parameters +# (the value of 'npar') and a body containing arguments and statements. +section-rules = element section-rules { attlist.section-rules, rule+ } +attlist.section-rules &= empty +# The rules section contains a sequence of one or more rules +rule = element rule { attlist.rule, pattern, action } +attlist.rule &= attribute comment { text }? 
+# Each rule has a pattern and an action +# * attribute 'comment' allows comments to be added about the purpose of +# the rule being defined +pattern = element pattern { attlist.pattern, pattern-item+ } +attlist.pattern &= empty +# The pattern is specified in terms of pattern items, each one +# representing a lexical form in the matched pattern +pattern-item = element pattern-item { attlist.pattern-item, empty } +attlist.pattern-item &= attribute n { xsd:IDREF } +# Each attribute to be activated is referred to by its name in the def-cats section +action = element action { attlist.action, sentence* } +attlist.action &= attribute c { text }? +# Encloses the procedural part of a rule +choose = element choose { attlist.choose, when+, otherwise? } +attlist.choose &= attribute c { text }? +# The choose statement is a selection statement (similar to a case +# statement) composed of one or more tested cases and an optional +# otherwise +when = element when { attlist.when, test, sentence* } +attlist.when &= attribute c { text }? +# Each tested case is a block of zero or more statements +otherwise = element otherwise { attlist.otherwise, sentence+ } +attlist.otherwise &= attribute c { text }? +# The otherwise case is also a block of one or more statements +test = element test { attlist.test, condition } +attlist.test &= attribute c { text }? +# The test in a tested case may be a conjunction, a disjunction, or +# a negation of simpler tests, as well as a simple equality test +and = element and { attlist.and, condition, condition+ } +attlist.and &= empty +# Each conjunction test contains two or more simpler tests +or = element or { attlist.or, condition, condition+ } +attlist.or &= empty +# Each disjunction test contains two or more simpler tests +not = element not { attlist.not, condition } +attlist.not &= empty +# The negation of a simpler test is a test itself +equal = element equal { attlist.equal, value, value } +attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the +# left part of the equality may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with = element begins-with { attlist.begins-with, value, value } +attlist.begins-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +ends-with = element ends-with { attlist.ends-with, value, value } +attlist.ends-with &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# Both parts of the test may both be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +begins-with-list = + element begins-with-list { attlist.begins-with-list, value, \list } +attlist.begins-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the beginning. +# First parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must be always a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. 
+ends-with-list = + element ends-with-list { attlist.ends-with-list, value, \list } +attlist.ends-with-list &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part at the end. +# The first part of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. The second part +# must always be a list. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +contains-substring = + element contains-substring { + attlist.contains-substring, value, value + } +attlist.contains-substring &= attribute caseless { "no" | "yes" }? +# Tests if the left part contains the right part. +# Both parts of the test may be a clip (see below), a +# literal string ('lit'), a literal tag ('lit-tag') or the value of +# a variable ('var') defined in the def-vars section. When the attribute +# 'caseless' is set to 'yes', the comparison is made without attending +# to the case. +in = element in { attlist.in, value, \list } +attlist.in &= attribute caseless { "no" | "yes" }? +# 'in' performs a search of a value in a list. 
If 'caseless' is set to yes, +# this search is performed without attending to the case +\list = element list { attlist.list, empty } +attlist.list &= attribute n { xsd:IDREF } +# 'list' refers, by the name in attribute 'n', to a list defined earlier in +# the 'section-def-list' section +let = element let { attlist.let, container, value } +attlist.let &= empty +# An assignment statement ('let') assigns the value of a clip (see +# below), a literal string ('lit'), a literal tag ('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip +append = element append { attlist.append, value+ } +attlist.append &= attribute n { xsd:IDREF } +# This instruction appends the value of a clip (see +# below), a literal string ('lit'), a literal tag ('lit-tag') or the +# value of a global variable ('var') to either a global variable ('var') +# or a clip, identified by the "n" attribute +out = element out { attlist.out, (mlu | lu | b | chunk | var)+ } +attlist.out &= attribute c { text }? +# 'out' is an output statement; it may output any sequence of +# clips, literal strings, literal tags, variables, and whitespace items +# (see below) +modify-case = + element modify-case { attlist.modify-case, container, stringvalue } +attlist.modify-case &= empty +# The first argument of 'modify-case' copies the case of the second +# argument. +call-macro = element call-macro { attlist.call-macro, with-param* } +attlist.call-macro &= attribute n { xsd:IDREF } +# A macro may be called anywhere by name with zero or more +# arguments +with-param = element with-param { attlist.with-param, empty } +attlist.with-param &= attribute pos { text } +# The attribute pos in each argument is used to refer to a lexical +# form in the current rule. 
For example, if a 2-parameter macro +# has been defined to perform noun-adjective agreement operations, +# it may be used with arguments 1 and 2 in a noun-adjective rule, +# with arguments 2 and 3 in a determiner-noun-adjective rule, with +# arguments 1 and 3 in a noun-adverb-adjective rule, and with +# arguments 2 and 1 in an adjective-noun rule +clip = element clip { attlist.clip, empty } +attlist.clip &= + attribute pos { text }, + attribute side { "sl" | "tl" }, + attribute part { text }, + attribute queue { text }?, + attribute link-to { text }?, + attribute c { text }? +# A 'clip' is a substring of a source-language or target-language +# lexical form, extracted according to an attribute: +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * 'side' is used to select a source-language ('sl') or a +# target-language ('tl') clip +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may also take the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). +# +# * the value of 'queue' may be 'no' or 'yes'. 'yes' is assumed by +# default. +# +# * 'link-to' causes the other attributes to be ignored in clip evaluation +# when using 'clip' as a right hand side element (as value), and +# returns its value. When used as a left hand side (as reference), +# the value of the 'link-to' attribute is ignored. 
+lit = element lit { attlist.lit, empty } +attlist.lit &= attribute v { text } +# A literal string value: the value of the literal is the value of +# the 'v' attribute +lit-tag = element lit-tag { attlist.lit-tag, empty } +attlist.lit-tag &= attribute v { text } +# A literal tag value: the value of the tag is the value of +# the 'v' attribute +var = element var { attlist.var, empty } +attlist.var &= attribute n { xsd:IDREF } +# Each 'var' is a variable identifier: the attribute n is the name +# of the variable. When it is in an 'out', a 'test', or the right +# part of a 'let', it represents the value of the variable; when in +# the left part of a 'let' it represents the reference of the +# variable. +get-case-from = + element get-case-from { attlist.get-case-from, (clip | lit | var) } +attlist.get-case-from &= attribute pos { text } +# Warning: all the comments that involve get-case-from still +# need to be updated +case-of = element case-of { attlist.case-of, empty } +attlist.case-of &= + attribute pos { text }, + attribute side { "sl" | "tl" }, + attribute part { text } +# A 'case-of' is a value representing the case of a "clip". This value +# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA" +# (all uppercase). +# +# * 'pos' is an index (1, 2, 3...) used to select a lexical form +# inside the rule; +# +# * 'side' is used to select a source-language ('sl') or a +# target-language ('tl') clip +# +# * the value of 'part' is the name of an attribute defined in +# def-attrs, but may also take the values 'lem' (referring to +# the lemma of the lexical form), 'lemh' (lemma head), 'lemq' +# (lemma queue) and 'whole' (referring to the whole lexical form). 
+concat = element concat { attlist.concat, value+ } +attlist.concat &= empty +# Concatenates a sequence of values +mlu = element mlu { attlist.mlu, lu+ } +attlist.mlu &= empty +# Encloses a multiword +lu = element lu { attlist.lu, value+ } +attlist.lu &= empty +# Encloses a word inside an 'out' element. +reject-current-rule = + element reject-current-rule { attlist.reject-current-rule, empty } +attlist.reject-current-rule &= attribute shifting { "yes" | "no" }? +# This instruction cancels the execution of the rule being processed. +# If "shifting" is set to "yes" or is not specified, the matching process +# consumes exactly one word from the input. If "shifting" is set to "no" +# then the rule is marked so that it is not considered in the current matching +# until the input buffer advances by at least one word +chunk = element chunk { attlist.chunk, tags, (mlu | lu | b | var)+ } +attlist.chunk &= + attribute name { text }?, + attribute namefrom { text }?, + attribute case { text }?, + attribute c { text }? +# Encloses a chunk inside an 'out' element. +# * 'name' the pseudolemma of the chunk. +# * 'namefrom' get the name from a variable. +# * 'case' the variable to get the uppercase/lowercase policy +# to apply it to the chunk name +tags = element tags { attlist.tags, tag+ } +attlist.tags &= empty +tag = element tag { attlist.tag, value } +attlist.tag &= empty +b = element b { attlist.b, empty } +attlist.b &= attribute pos { text }? +start = transfer +# 'b' is a [super]blanks item, indexed by pos; for example, a 'b' +# with pos="2" refers to the [super]blanks (including format data +# encapsulated by the de-formatter) between lexical form 2 and +# lexical form 3. Managing [super]blanks explicitly allows for the +# correct placement of format when the result of structural +# transfer has more or less lexical items than the original or has +# been reordered in some way. If attribute "pos" is not specified, then +# a single blank (ASCII 32) is generated. 
Index: branches/apertium-tagger/apertium2/apertium/transfer.rng =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.rng (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.rng (revision 69632) @@ -0,0 +1,1104 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lu + chunk + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + no + yes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sl + tl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sl + tl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + yes + no + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/apertium-transfer.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-transfer.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-transfer.1 (revision 69632) @@ -0,0 +1,80 @@ +.TH apertium-transfer 1 2006-03-08 "" "" +.SH NAME +apertium-transfer \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-transfer +[\-n] trules preproc biltrans [input [output]] +.PP +.B apertium-transfer +trules preproc [input [output]] +.PP +.B apertium-transfer +\-x extended trules preproc biltrans [input [output]] +.PP +.B apertium-transfer +\-c trules preproc biltrans [input [output]] +.PP +.B apertium-transfer +\-t trules preproc biltrans [input [output]] +.SH DESCRIPTION +.BR apertium-transfer +is the program that performs the transfer from input language +into output language. Normally this program will not be used independently, but in combination with other programs: +.PP +.RE +.SH FILES +These are the five files that can be used with this command: +.PP +.B trules +Transfer rules file +.PP +.B preproc +Result of preprocess trules file +.PP +.B biltrans +Bilingual letter transducer file +.PP +.B infile +Input file (stdin by default). +.PP +.B outfile +Output file (stdout by default). +.PP +\-.B \-b +\-input from lexical transfer (single level transfer only) +\-.PP +\-.B \-h +\-shows this message +\-.PP +.B -n +Do not use a bilingual dictionary to process the input. 
+.PP +.B -x bindix +extended mode with user dictionary +.PP +.B -c +case-sensitiveness while accessing bilingual dictionary +.PP +.B -t +trace mode: show rule numbers and matched content +.PP +.B -T +extended trace mode, for use with apertium-transfer-tools +.PP +.B -z +null-flushing output on +.PP +.SH SEE ALSO +.I apertium \fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/transfer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.cc (revision 69632) @@ -0,0 +1,2346 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +void +Transfer::destroy() +{ + if(me) + { + delete me; + me = NULL; + } + if(doc) + { + xmlFreeDoc(doc); + doc = NULL; + } +} + +Transfer::Transfer() : +word(0), +blank(0), +lword(0), +lblank(0), +output(0), +any_char(0), +any_tag(0), +nwords(0) +{ + me = NULL; + doc = NULL; + root_element = NULL; + lastrule = NULL; + defaultAttrs = lu; + useBilingual = true; + preBilingual = false; + isExtended = false; + null_flush = false; + internal_null_flush = false; + trace = false; + trace_att = false; + emptyblank = ""; +} + +Transfer::~Transfer() +{ + destroy(); +} + +void +Transfer::readData(FILE *in) +{ + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + attr_items[cad_k].read(in); + wstring fallback = Compression::wstring_read(in); + if(recompile_attrs) { + attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + } + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = 
UtfConverter::toUtf8(Compression::wstring_read(in)); + macros[cad_k] = Compression::multibyte_read(in); + } + + // lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +Transfer::readBil(string const &fstfile) +{ + FILE *in = fopen(fstfile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << fstfile << "'." << endl; + exit(EXIT_FAILURE); + } + fstp.load(in); + fstp.initBiltrans(); + fclose(in); +} + +void +Transfer::setExtendedDictionary(string const &fstfile) +{ + FILE *in = fopen(fstfile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open extended dictionary file '" << fstfile << "'." << endl; + exit(EXIT_FAILURE); + } + extended.load(in); + extended.initBiltrans(); + fclose(in); + isExtended = true; +} + +void +Transfer::read(string const &transferfile, string const &datafile, + string const &fstfile) +{ + readTransfer(transferfile); + + // datafile + FILE *in = fopen(datafile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + + if(fstfile != "") + { + readBil(fstfile); + } +} + +void +Transfer::readTransfer(string const &in) +{ + doc = xmlReadFile(in.c_str(), NULL, 0); + + if(doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." 
<< endl; + exit(EXIT_FAILURE); + } + + root_element = xmlDocGetRootElement(doc); + + // search for root element attributes + for(xmlAttr *i = root_element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "default")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "chunk")) + { + defaultAttrs = chunk; + } + else + { + defaultAttrs = lu; // default value for 'default' + } + } + } + + // search for macros & rules + for(xmlNode *i = root_element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) + { + collectMacros(i); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) + { + collectRules(i); + } + } + } +} + +void +Transfer::collectRules(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + for(xmlNode *j = i->children; ; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) + { + rule_map.push_back(j); + break; + } + } + } + } +} + +void +Transfer::collectMacros(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + macro_map.push_back(i); + } + } +} + +bool +Transfer::checkIndex(xmlNode *element, int index, int limit) +{ + if(index >= limit) + { + wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) <line << endl; + return false; + } + return true; +} + + +string +Transfer::evalString(xmlNode *element) +{ + map::iterator it; + it = evalStringCache.find(element); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_clip_sl: + if(checkIndex(element, ti.getPos(), lword)) + { + return word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()); + } + break; + + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return 
word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()); + } + break; + + case ti_linkto_sl: + if(checkIndex(element, ti.getPos(), lword)) + { + if(word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()) != "") + { + return "<" + string((char *) ti.getPointer()) + ">"; + } + else + { + return ""; + } + } + break; + + case ti_linkto_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + if(word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()) != "") + { + return "<" + string((char *) ti.getPointer()) + ">"; + } + else + { + return ""; + } + } + break; + + case ti_var: + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(checkIndex(element, ti.getPos(), lblank)) + { + if(ti.getPos() >= 0) + { + return !blank?"":*(blank[ti.getPos()]); + } + return " "; + } + break; + + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) + { + return copycase(word[ti.getPos()]->source(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); + } + break; + + case ti_case_of_sl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->source(attr_items[ti.getContent()])); + } + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->target(attr_items[ti.getContent()])); + } + break; + + default: + return ""; + } + return ""; + } + + if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL, *as = NULL; + bool queue = true; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *)i->children->content) - 1; + } + 
else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = i->children->content; + } + } + + if(as != NULL) + { + if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + evalStringCache[element] = TransferInstr(ti_linkto_sl, (const char *) part, pos, (void *) as, queue); + } + else + { + evalStringCache[element] = TransferInstr(ti_linkto_tl, (const char *) part, pos, (void *) as, queue); + } + } + else if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + evalStringCache[element] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); + } + else + { + evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) + { + evalStringCache[element] = TransferInstr(ti_lit_tag, + tags((const char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) + { + evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) + { + if(element->properties == NULL) + { + evalStringCache[element] = TransferInstr(ti_b, " ", -1); + } + else + { + int pos = atoi((const char *) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, "", pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) + { + int pos = atoi((const char *) element->properties->children->content) - 1; + xmlNode *param = NULL; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + param = i; + break; + } + } + + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) + { + 
evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + } + + if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + evalStringCache[element] = TransferInstr(ti_case_of_sl, (const char *) part, pos); + } + else + { + evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) + { + string value; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + value.append(evalString(i)); + } + } + return value; + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + myword.append(evalString(i)); + } + } + + if(myword != "") + { + return "^"+myword+"$"; + } + else + { + return ""; + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) + { + string value; + + bool first_time = true; + + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + string myword; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + + if(!first_time) + { + if(myword != "" && myword[0] != '#') //'+#' problem + { + value.append("+"); + } + } + else + { + if(myword != "") + { + first_time = false; + } + } + + value.append(myword); + } 
+ } + + if(value != "") + { + return "^"+value+"$"; + } + else + { + return ""; + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) + { + return processChunk(element); + } + else + { + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + + return evalString(element); +} + +void +Transfer::processOut(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(defaultAttrs == lu) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + if(myword != "") + { + fputwc_unlocked(L'^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + fputwc_unlocked(L'$', output); + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) + { + fputwc_unlocked('^', output); + bool first_time = true; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + string myword; + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + myword.append(evalString(k)); + } + } + + if(!first_time) + { + if(myword != "" && myword[0] != '#') //'+#' problem + { + fputwc_unlocked(L'+', output); + } + } + else + { + if(myword != "") + { + first_time = false; + } + } + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + } + } + fputwc_unlocked(L'$', output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), + output); + } + } + else + { + if(!xmlStrcmp(i->name, (const xmlChar *) "chunk")) + { + fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + } + } + } + } +} + +string +Transfer::processChunk(xmlNode *localroot) +{ + 
string name, namefrom; + string caseofchunk = "aa"; + string result; + + + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "name")) + { + name = (const char *) i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "namefrom")) + { + namefrom = (const char *) i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "case")) + { + caseofchunk = (const char *) i->children->content; + } + } + + result.append("^"); + if(caseofchunk != "") + { + if(name != "") + { + result.append(copycase(variables[caseofchunk], name)); + } + else if(namefrom != "") + { + result.append(copycase(variables[caseofchunk], variables[namefrom])); + } + else + { + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + exit(EXIT_FAILURE); + } + } + else + { + if(name != "") + { + result.append(name); + } + else if(namefrom != "") + { + result.append(variables[namefrom]); + } + else + { + cerr << "Error: you must specify either 'name' or 'namefrom' for the 'chunk' element" << endl; + exit(EXIT_FAILURE); + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "tags")) + { + result.append(processTags(i)); + result.append("{"); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + if(myword != "") + { + result.append("^"); + result.append(myword); + result.append("$"); + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) + { + bool first_time = true; + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + string mylocalword; + if(j->type == XML_ELEMENT_NODE) + { + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + 
mylocalword.append(evalString(k)); + } + } + + if(!first_time) + { + if(mylocalword != "" && mylocalword[0] != '#') // '+#' problem + { + myword += '+'; + } + } + else + { + first_time = false; + } + } + myword.append(mylocalword); + } + if(myword != "") + { + result.append("^"); + result.append(myword); + result.append("$"); + } + } + else // 'b' + { + result.append(evalString(i)); + } + } + } + result.append("}$"); + return result; +} + +string +Transfer::processTags(xmlNode *localroot) +{ + string result; + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + result.append(evalString(j)); + } + } + } + } + } + return result; +} + +int +Transfer::processInstruction(xmlNode *localroot) +{ + int words_to_consume = -1; + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + words_to_consume = processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule")) + { + words_to_consume = processRejectCurrentRule(localroot); + } + return words_to_consume; +} + +int +Transfer::processRejectCurrentRule(xmlNode *localroot) +{ + bool shifting = true; + string value; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "shifting")) + { + value = (char 
*) i->children->content; + break; + } + } + + if(value == "no") + { + shifting = false; + } + + return shifting ? 1 : 0; +} + +void +Transfer::processLet(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + map::iterator it = evalStringCache.find(leftSide); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_var: + variables[ti.getContent()] = evalString(rightSide); + return; + + case ti_clip_sl: + word[ti.getPos()]->setSource(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + return; + + case ti_clip_tl: + word[ti.getPos()]->setTarget(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition()); + return; + + default: + return; + } + } + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = evalString(rightSide); + evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL, *as = NULL; + bool queue = true; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (const xmlChar *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = 
i->children->content; + } + } + + if(!xmlStrcmp(side, (const xmlChar *) "tl")) + { + word[pos]->setTarget(attr_items[(const char *) part], evalString(rightSide), queue); + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL, queue); + } + else + { + word[pos]->setSource(attr_items[(const char *) part], evalString(rightSide), queue); + evalStringCache[leftSide] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue); + } + } +} + +void +Transfer::processAppend(xmlNode *localroot) +{ + string name; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "n")) + { + name = (char *) i->children->content; + break; + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + variables[name].append(evalString(i)); + } + } +} + +void +Transfer::processModifyCase(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL, *side = NULL, *as = NULL; + bool queue = true; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "side")) + { + side = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content) - 1; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "queue")) + { + if(!xmlStrcmp(i->children->content, (xmlChar const *) "no")) + { + queue = false; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to")) + { + as = i->children->content; + 
(void)as; // ToDo, remove "as" and the whole else? + } + } + if(!xmlStrcmp(side, (const xmlChar *) "sl")) + { + string const result = copycase(evalString(rightSide), + word[pos]->source(attr_items[(const char *) part], queue)); + word[pos]->setSource(attr_items[(const char *) part], result); + } + else + { + string const result = copycase(evalString(rightSide), + word[pos]->target(attr_items[(const char *) part], queue)); + word[pos]->setTarget(attr_items[(const char *) part], result); + } + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = copycase(evalString(rightSide), variables[val]); + } +} + +void +Transfer::processCallMacro(xmlNode *localroot) +{ + string const n = (const char *) localroot->properties->children->content; + int npar = 0; + + xmlNode *macro = macro_map[macros[n]]; + + for(xmlAttr *i = macro->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) + { + npar = atoi((const char *) i->children->content); + break; + } + } + + // ToDo: Is it at all valid if npar <= 0 ? 
+ + TransferWord **myword = NULL; + if(npar > 0) + { + myword = new TransferWord *[npar]; + } + string **myblank = NULL; + if(npar > 0) + { + myblank = new string *[npar]; + myblank[npar-1] = &emptyblank; + } + + int idx = 0; + int lastpos = 0; + for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + int pos = atoi((const char *) i->properties->children->content)-1; + myword[idx] = word[pos]; + if(idx-1 >= 0) + { + myblank[idx-1] = blank[lastpos]; + } + idx++; + lastpos = pos; + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + for(xmlNode *i = macro->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + delete[] myword; + delete[] myblank; +} + +int +Transfer::processChoose(xmlNode *localroot) +{ + int words_to_consume = -1; + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "when")) + { + bool picked_option = false; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "test")) + { + if(!processTest(j)) + { + break; + } + else + { + picked_option = true; + } + } + else + { + words_to_consume = processInstruction(j); + if(words_to_consume != -1) + { + return words_to_consume; + } + } + } + } + if(picked_option) + { + return words_to_consume; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + words_to_consume = processInstruction(j); + if(words_to_consume != -1) + { + return words_to_consume; + } + } + } + } + } + } + return words_to_consume; +} + +bool +Transfer::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar 
*) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +Transfer::processIn(xmlNode *localroot) +{ + xmlNode *value = NULL; + xmlChar *idlist = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(value == NULL) + { + value = i; + } + else + { + idlist = i->properties->children->content; + break; + } + } + } + + string sval = evalString(value); + + if(localroot->properties != NULL) + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + set &myset = listslow[(const char *) idlist]; + if(myset.find(tolower(sval)) != myset.end()) + { + return true; + } + else + { + return false; + } + } + } + + set &myset = lists[(const char *) idlist]; + if(myset.find(sval) != myset.end()) + { + return true; + } + else + { + return false; + } +} + +bool +Transfer::processTest(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i 
!= NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return processLogical(i); + } + } + return false; +} + +bool +Transfer::processAnd(xmlNode *localroot) +{ + bool val = true; + for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val && processLogical(i); + } + } + + return val; +} + +bool +Transfer::processOr(xmlNode *localroot) +{ + bool val = false; + for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val || processLogical(i); + } + } + + return val; +} + +bool +Transfer::processNot(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return !processLogical(i); + } + } + return false; +} + +bool +Transfer::processEqual(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first) == evalString(second); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)) == tolower(evalString(second)); + } + else + { + return evalString(first) == evalString(second); + } + } +} + +bool +Transfer::beginsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = 0; i != limit; i++) + { + if(s1[i] != s2[i]) + { + return false; + } + } + + return true; +} + +bool +Transfer::endsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) + { + 
if(s1[j] != s2[i]) + { + return false; + } + } + + return true; +} + + +bool +Transfer::processBeginsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return beginsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return beginsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return beginsWith(evalString(first), evalString(second)); + } + } +} + +bool +Transfer::processEndsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return endsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return endsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return endsWith(evalString(first), evalString(second)); + } + } +} + +bool +Transfer::processBeginsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = 
lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(beginsWith(needle, *it)) + { + return true; + } + } + return false; +} + + +bool +Transfer::processEndsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(endsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Transfer::processContainsSubstring(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first).find(evalString(second)) != string::npos; + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + } + else + { + return evalString(first).find(evalString(second)) != string::npos; + } + } +} + +string +Transfer::copycase(string const &source_word, string const &target_word) +{ + wstring result; + wstring 
const s_word = UtfConverter::fromUtf8(source_word); + wstring const t_word = UtfConverter::fromUtf8(target_word); + + bool firstupper = iswupper(s_word[0]); + bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); + bool sizeone = s_word.size() == 1; + + if(!uppercase || (sizeone && uppercase)) + { + result = t_word; + result[0] = towlower(result[0]); + //result = StringUtils::tolower(t_word); + } + else + { + result = StringUtils::toupper(t_word); + } + + if(firstupper) + { + result[0] = towupper(result[0]); + } + + return UtfConverter::toUtf8(result); +} + +string +Transfer::caseOf(string const &str) +{ + wstring const s = UtfConverter::fromUtf8(str); + + if(s.size() > 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else if(!iswupper(s[s.size()-1])) + { + return "Aa"; + } + else + { + return "AA"; + } + } + else if(s.size() == 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else + { + return "Aa"; + } + } + else + { + return "aa"; + } +} + +string +Transfer::tolower(string const &str) const +{ + return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); +} + +string +Transfer::tags(string const &str) const +{ + string result = "<"; + + for(unsigned int i = 0, limit = str.size(); i != limit; i++) + { + if(str[i] == '.') + { + result.append("><"); + } + else + { + result += str[i]; + } + } + + result += '>'; + + return result; +} + +int +Transfer::processRule(xmlNode *localroot) +{ + int instruction_return, words_to_consume = -1; + // localroot is suposed to be an 'action' tag + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + instruction_return = processInstruction(i); + // When an instruction which modifies the number of words to be consumed + // from the input is found, execution of the rule is stopped + if(instruction_return != -1) + { + words_to_consume = instruction_return; + break; + } + } + } + return words_to_consume; +} + +TransferToken & 
+Transfer::readToken(FILE *in) +{ + if(!input_buffer.isEmpty()) + { + return input_buffer.next(); + } + + wstring content; + while(true) + { + int val = fgetwc_unlocked(in); + if(feof(in) || (val == 0 && internal_null_flush)) + { + return input_buffer.add(TransferToken(content, tt_eof)); + } + if(val == '\\') + { + content += L'\\'; + content += (wchar_t) fgetwc_unlocked(in); + } + else if(val == L'[') + { + content += L'['; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } + } + } + else if(val == L'$') + { + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val == L'^') + { + return input_buffer.add(TransferToken(content, tt_blank)); + } + else if(val == L'\0' && null_flush) + { + fflush(output); + } + else + { + content += wchar_t(val); + } + } +} + +bool +Transfer::getNullFlush(void) +{ + return null_flush; +} + +void +Transfer::setNullFlush(bool null_flush) +{ + this->null_flush = null_flush; +} + +void +Transfer::setTrace(bool trace) +{ + this->trace = trace; +} + +void +Transfer::setTraceATT(bool trace) +{ + this->trace_att = trace; +} + +void +Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out) +{ + null_flush = false; + internal_null_flush = true; + + while(!feof(in)) + { + transfer(in, out); + fputwc_unlocked(L'\0', out); + int code = fflush(out); + if(code != 0) + { + wcerr << L"Could not flush output " << errno << endl; + } + } + + internal_null_flush = false; + null_flush = true; +} + +void +Transfer::transfer(FILE *in, FILE *out) +{ + if(getNullFlush()) + { + transfer_wrapper_null_flush(in, out); + } + + int last = 0; + int prev_last = 0; + int lastrule_id = -1; + set banned_rules; + + output = out; + ms.init(me->getInitial()); + + while(true) + { + if(trace_att) + { + cerr << "Loop start " << endl; + cerr << "ms.size: " << 
ms.size() << endl; + + cerr << "tmpword.size(): " << tmpword.size() << endl; + for (unsigned int ind = 0; ind < tmpword.size(); ind++) + { + if(ind != 0) + { + wcerr << L" "; + } + wcerr << *tmpword[ind]; + } + wcerr << endl; + + cerr << "tmpblank.size(): " << tmpblank.size() << endl; + for (unsigned int ind = 0; ind < tmpblank.size(); ind++) + { + wcerr << L"'"; + wcerr << *tmpblank[ind]; + wcerr << L"' "; + } + wcerr << endl; + + cerr << "last: " << last << endl; + cerr << "prev_last: " << prev_last << endl << endl; + } + + if(ms.size() == 0) + { + if(lastrule != NULL) + { + int num_words_to_consume = applyRule(); + + if(trace_att) + { + cerr << "num_words_to_consume: " << num_words_to_consume << endl; + } + + //Consume all the words from the input which matched the rule. + //This piece of code is executed unless the rule contains a "reject-current-rule" instruction + if(num_words_to_consume < 0) + { + banned_rules.clear(); + input_buffer.setPos(last); + } + else if(num_words_to_consume > 0) + { + banned_rules.clear(); + if(prev_last >= input_buffer.getSize()) + { + input_buffer.setPos(0); + } + else + { + input_buffer.setPos(prev_last+1); + } + int num_consumed_words = 0; + while(num_consumed_words < num_words_to_consume) + { + TransferToken& local_tt = input_buffer.next(); + if (local_tt.getType() == tt_word) + { + num_consumed_words++; + } + } + } + else + { + //Add rule to banned rules + banned_rules.insert(lastrule_id); + input_buffer.setPos(prev_last); + input_buffer.next(); + last = input_buffer.getPos(); + } + lastrule_id = -1; + } + else + { + if(tmpword.size() != 0) + { + if(trace_att) + { + cerr << "printing tmpword[0]" < tr; + if(useBilingual && preBilingual == false) + { + if(isExtended && (*tmpword[0])[0] == L'*') + { + tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false); + if(tr.first[0] == L'@') + { + tr.first[0] = L'*'; + } + else + { + tr.first = L"%" + tr.first; + } + } + else + { + tr = fstp.biltransWithQueue(*tmpword[0], false); + 
} + } + else if(preBilingual) + { + wstring sl; + wstring tl; + int seenSlash = 0; + for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++) + { + if(*it == L'\\') + { + if(seenSlash == 0) + { + sl.push_back(*it); + it++; + sl.push_back(*it); + } + else + { + tl.push_back(*it); + it++; + tl.push_back(*it); + } + continue; + } + else if(*it == L'/') + { + seenSlash++; + continue; + } + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) + { + tl.push_back(*it); + } + else if(seenSlash > 1) + { + break; + } + } + //tmpword[0]->assign(sl); + tr = pair(tl, false); + //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ; + } + else + { + tr = pair(*tmpword[0], 0); + } + + if(tr.first.size() != 0) + { + if(defaultAttrs == lu) + { + fputwc_unlocked(L'^', output); + fputws_unlocked(tr.first.c_str(), output); + fputwc_unlocked(L'$', output); + } + else + { + if(tr.first[0] == '*') + { + fputws_unlocked(L"^unknown{^", output); + } + else + { + fputws_unlocked(L"^default{^", output); + } + fputws_unlocked(tr.first.c_str(), output); + fputws_unlocked(L"$}$", output); + } + } + banned_rules.clear(); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + prev_last = last; + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) + { + if(trace_att) + { + cerr << "printing tmpblank[0]" <c_str(), output); + tmpblank.clear(); + prev_last = last; + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + } + } + int val = ms.classifyFinals(me->getFinals(), banned_rules); + if(val != -1) + { + lastrule = rule_map[val-1]; + lastrule_id = val; + last = input_buffer.getPos(); + + if(trace) + { + wcerr << endl << L"apertium-transfer: Rule " << val << L" "; + for (unsigned int ind = 0; ind < tmpword.size(); ind++) + { + if (ind != 0) + { + wcerr << L" "; + } + wcerr << *tmpword[ind]; + } + wcerr << endl; + } + } + + TransferToken ¤t = readToken(in); + 
+ switch(current.getType()) + { + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(¤t.getContent()); + break; + + case tt_blank: + ms.step(L' '); + tmpblank.push_back(¤t.getContent()); + break; + + case tt_eof: + if(tmpword.size() != 0) + { + tmpblank.push_back(¤t.getContent()); + ms.clear(); + } + else + { + fputws_unlocked(current.getContent().c_str(), output); + return; + } + break; + + default: + cerr << "Error: Unknown input token." << endl; + return; + } + } +} + +int +Transfer::applyRule() +{ + int words_to_consume; + unsigned int limit = tmpword.size(); + //wcerr << L"applyRule: " << tmpword.size() << endl; + + for(unsigned int i = 0; i != limit; i++) + { + if(i == 0) + { + word = new TransferWord *[limit]; + lword = limit; + if(limit != 1) + { + blank = new string *[limit - 1]; + lblank = limit - 1; + } + else + { + blank = NULL; + lblank = 0; + } + } + else + { + blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + } + + pair tr; + if(useBilingual && preBilingual == false) + { + tr = fstp.biltransWithQueue(*tmpword[i], false); + } + else if(preBilingual) + { + //wcerr << "applyRule: " << *tmpword[i] << endl; + wstring sl; + wstring tl; + int seenSlash = 0; + for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++) + { + if(*it == L'\\') + { + if(seenSlash == 0) + { + sl.push_back(*it); + it++; + sl.push_back(*it); + } + else + { + tl.push_back(*it); + it++; + tl.push_back(*it); + } + continue; + } + + if(*it == L'/') + { + seenSlash++; + continue; + } + if(seenSlash == 0) + { + sl.push_back(*it); + } + else if(seenSlash == 1) + { + tl.push_back(*it); + } + else if(seenSlash > 1) + { + break; + } + } + //tmpword[i]->assign(sl); + tr = pair(tl, false); + } + else + { + tr = pair(*tmpword[i], false); + } + + word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]), + UtfConverter::toUtf8(tr.first), tr.second); + } + + words_to_consume = processRule(lastrule); + lastrule = NULL; + + if(word) + { 
+ for(unsigned int i = 0; i != limit; i++) + { + delete word[i]; + } + delete[] word; + } + if(blank) + { + for(unsigned int i = 0; i != limit - 1; i++) + { + delete blank[i]; + } + delete[] blank; + } + word = NULL; + blank = NULL; + tmpword.clear(); + tmpblank.clear(); + ms.init(me->getInitial()); + return words_to_consume; +} + +/* HERE */ +void +Transfer::applyWord(wstring const &word_str) +{ + ms.step(L'^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) + { + switch(word_str[i]) + { + case L'\\': + i++; + ms.step(towlower(word_str[i]), any_char); + break; + + case L'/': + i = limit; + break; + + case L'<': + for(unsigned int j = i+1; j != limit; j++) + { + if(word_str[j] == L'>') + { + int symbol = alphabet(word_str.substr(i, j-i+1)); + if(symbol) + { + ms.step(symbol, any_tag); + } + else + { + ms.step(any_tag); + } + i = j; + break; + } + } + break; + + default: + ms.step(towlower(word_str[i]), any_char); + break; + } + } + ms.step(L'$'); +} + +void +Transfer::setPreBilingual(bool value) +{ + preBilingual = value; +} + +bool +Transfer::getPreBilingual(void) const +{ + return preBilingual; +} + +void +Transfer::setUseBilingual(bool value) +{ + useBilingual = value; +} + +bool +Transfer::getUseBilingual(void) const +{ + return useBilingual; +} + +void +Transfer::setCaseSensitiveness(bool value) +{ + fstp.setCaseSensitiveMode(value); +} Index: branches/apertium-tagger/apertium2/apertium/transfer.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.dtd (revision 69632) @@ -0,0 +1,489 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/transfer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer.h (revision 69632) @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _TRANSFER_ +#define _TRANSFER_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +class Transfer +{ +private: + + Alphabet alphabet; + MatchExe *me; + MatchState ms; + map attr_items; + map variables; + map macros; + map, Ltstr> lists; + map, Ltstr> listslow; + vector macro_map; + vector rule_map; + xmlDoc *doc; + xmlNode *root_element; + TransferWord **word; + string **blank; + int lword, lblank; + Buffer input_buffer; + vector tmpword; + vector tmpblank; + + FSTProcessor fstp; + FSTProcessor extended; + bool isExtended; + FILE *output; + int any_char; + int any_tag; + + xmlNode *lastrule; + unsigned int nwords; + + map evalStringCache; + + enum OutputType{lu,chunk}; + + OutputType defaultAttrs; + bool preBilingual; + bool useBilingual; + bool null_flush; + bool internal_null_flush; + bool trace; + bool trace_att; + string emptyblank; + + void destroy(); + void readData(FILE *input); + void readBil(string const &filename); + void readTransfer(string const &input); + void collectMacros(xmlNode *localroot); + void collectRules(xmlNode *localroot); + string caseOf(string const &str); + string copycase(string const &source_word, string const &target_word); + + void processLet(xmlNode *localroot); + void processAppend(xmlNode *localroot); + int processRejectCurrentRule(xmlNode *localroot); + void processOut(xmlNode *localroot); + void processCallMacro(xmlNode *localroot); + void processModifyCase(xmlNode *localroot); + bool processLogical(xmlNode *localroot); + bool processTest(xmlNode *localroot); + bool processAnd(xmlNode *localroot); + bool processOr(xmlNode *localroot); + bool processEqual(xmlNode *localroot); + bool processBeginsWith(xmlNode *localroot); + bool processBeginsWithList(xmlNode *localroot); + bool processEndsWith(xmlNode *localroot); + bool processEndsWithList(xmlNode *local); + bool 
processContainsSubstring(xmlNode *localroot); + bool processNot(xmlNode *localroot); + bool processIn(xmlNode *localroot); + int processRule(xmlNode *localroot); + string evalString(xmlNode *localroot); + int processInstruction(xmlNode *localroot); + int processChoose(xmlNode *localroot); + string processChunk(xmlNode *localroot); + string processTags(xmlNode *localroot); + + bool beginsWith(string const &str1, string const &str2) const; + bool endsWith(string const &str1, string const &str2) const; + string tolower(string const &str) const; + string tags(string const &str) const; + wstring readWord(FILE *in); + wstring readBlank(FILE *in); + wstring readUntil(FILE *in, int const symbol) const; + void applyWord(wstring const &word_str); + int applyRule(); + TransferToken & readToken(FILE *in); + bool checkIndex(xmlNode *element, int index, int limit); + void transfer_wrapper_null_flush(FILE *in, FILE *out); +public: + Transfer(); + ~Transfer(); + + void read(string const &transferfile, string const &datafile, + string const &fstfile = ""); + void transfer(FILE *in, FILE *out); + void setUseBilingual(bool value); + bool getUseBilingual(void) const; + void setPreBilingual(bool value); + bool getPreBilingual(void) const; + void setExtendedDictionary(string const &fstfile); + void setCaseSensitiveness(bool value); + bool getNullFlush(void); + void setNullFlush(bool null_flush); + void setTrace(bool trace); + void setTraceATT(bool trace); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/a.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/a.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/a.cc (revision 69632) @@ -0,0 +1,50 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software 
Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "a.h" + +#include "analysis.h" +#include "exception.h" + +namespace Apertium { +bool operator==(const a &a_, const a &b_) { + return a_.TheTags == b_.TheTags && a_.TheMorphemes == b_.TheMorphemes; +} + +bool operator<(const a &a_, const a &b_) { + if (a_.TheTags == b_.TheTags) + return a_.TheMorphemes < b_.TheMorphemes; + + return a_.TheTags < b_.TheTags; +} + +a::a() : TheTags(), TheMorphemes() {} + +a::a(const Analysis &Analysis_) : TheTags(), TheMorphemes() { + if (Analysis_.TheMorphemes.empty()) + throw Exception::Analysis::TheMorphemes_empty("can't convert const " + "Analysis & comprising empty " + "Morpheme std::vector to a"); + + if (Analysis_.TheMorphemes.front().TheTags.empty()) + throw Exception::Morpheme::TheTags_empty("can't convert const Analysis & " + "comprising Morpheme comprising " + "empty Tag std::vector to a"); + + TheTags = Analysis_.TheMorphemes.front().TheTags; + TheMorphemes = std::vector(Analysis_.TheMorphemes.begin() + 1, + Analysis_.TheMorphemes.end()); +} +} Index: branches/apertium-tagger/apertium2/apertium/a.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/a.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/a.h (revision 69632) @@ -0,0 +1,37 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software 
Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef A_H +#define A_H + +#include "analysis.h" +#include "morpheme.h" +#include "tag.h" + +#include + +namespace Apertium { +class a { +public: + friend bool operator==(const a &a_, const a &b_); + friend bool operator<(const a &a_, const a &b_); + a(); + a(const Analysis &Analysis_); + std::vector TheTags; + std::vector TheMorphemes; +}; +} + +#endif // A_H Index: branches/apertium-tagger/apertium2/apertium/align.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/align.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/align.cc (revision 69632) @@ -0,0 +1,56 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "align.h" + +#include "linebreak.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +void align::align_( + const std::vector > &string_) { + const std::streamsize width_ = col(string_) + 2; + + for (std::vector >::const_iterator i_ = + string_.begin(); + i_ != string_.end(); ++i_) { + std::cerr << " " << std::setw(width_) << std::left << i_->first + << std::setw(0) + << linebreak::linebreak_(i_->second, width_ + 2, width_ + 4) + << '\n'; + } +} + +std::string::size_type +align::col(const std::vector > &string_) { + std::string::size_type col_ = 0; + + for (std::vector >::const_iterator i_ = + string_.begin(); + i_ != string_.end(); ++i_) { + if (i_->first.size() > col_) + col_ = i_->first.size(); + } + + return col_; +} +} Index: branches/apertium-tagger/apertium2/apertium/align.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/align.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/align.h (revision 69632) @@ -0,0 +1,35 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef ALIGN_H +#define ALIGN_H + +#include +#include +#include + +namespace Apertium { +class align { +public: + static void + align_(const std::vector > &string_); + +private: + static std::string::size_type + col(const std::vector > &string_); +}; +} + +#endif // ALIGN_H Index: branches/apertium-tagger/apertium2/apertium/analysis.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/analysis.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/analysis.cc (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "analysis.h" + +#include "exception.h" +#include "morpheme.h" + +#include +#include + +namespace Apertium { +std::wostream &operator<<(std::wostream &Stream_, const Analysis &Analysis_) { + Stream_ << static_cast(Analysis_); + return Stream_; +} + +bool operator==(const Analysis &a, const Analysis &b) { + return a.TheMorphemes == b.TheMorphemes; +} + +bool operator<(const Analysis &a, const Analysis &b) { + return a.TheMorphemes < b.TheMorphemes; +} + +Analysis::operator std::wstring() const { + if (TheMorphemes.empty()) + throw Exception::Analysis::TheMorphemes_empty( + "can't convert Analysis comprising empty Morpheme std::vector to " + "std::wstring"); + + std::vector::const_iterator Morpheme_ = TheMorphemes.begin(); + std::wstring wstring_ = *Morpheme_; + ++Morpheme_; + + // Call .end() each iteration to save memory. + for (; Morpheme_ != TheMorphemes.end(); ++Morpheme_) { + wstring_ += L"+" + static_cast(*Morpheme_); + } + + return wstring_; +} +} Index: branches/apertium-tagger/apertium2/apertium/analysis.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/analysis.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/analysis.h (revision 69632) @@ -0,0 +1,37 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef ANALYSIS_H +#define ANALYSIS_H + +#include "morpheme.h" + +#include +#include +#include + +namespace Apertium { +class Analysis { +public: + friend std::wostream &operator<<(std::wostream &Stream_, + const Analysis &Analysis_); + friend bool operator==(const Analysis &a, const Analysis &b); + friend bool operator<(const Analysis &a, const Analysis &b); + operator std::wstring() const; + std::vector TheMorphemes; +}; +} + +#endif // ANALYSIS_H Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.cc (revision 69632) @@ -0,0 +1,20 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_5_3_1_tagger.h" + +namespace Apertium { +basic_5_3_1_Tagger::basic_5_3_1_Tagger() {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_1_tagger.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_5_3_1_TAGGER_H +#define BASIC_5_3_1_TAGGER_H + +#include "analysis.h" + +#include +#include + +namespace Apertium { +class basic_5_3_1_Tagger { +protected: + basic_5_3_1_Tagger(); + std::map Model; +}; +} + +#endif // BASIC_5_3_1_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.cc (revision 69632) @@ -0,0 +1,20 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_5_3_2_tagger.h" + +namespace Apertium { +basic_5_3_2_Tagger::basic_5_3_2_Tagger() {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_2_tagger.h (revision 69632) @@ -0,0 +1,33 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_5_3_2_TAGGER_H +#define BASIC_5_3_2_TAGGER_H + +#include "a.h" +#include "lemma.h" + +#include +#include + +namespace Apertium { +class basic_5_3_2_Tagger { +protected: + basic_5_3_2_Tagger(); + std::map > Model; +}; +} + +#endif // BASIC_5_3_2_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_5_3_3_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_5_3_3_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_5_3_3_tagger.h (revision 69632) @@ -0,0 +1,35 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_5_3_3_TAGGER_H +#define BASIC_5_3_3_TAGGER_H + +#include "i.h" +#include "lemma.h" + +#include +#include +#include + +namespace Apertium { +class basic_5_3_3_Tagger { +protected: + std::pair >, + std::pair >, + std::map > > > Model; +}; +} + +#endif // BASIC_5_3_3_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_exception_type.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_exception_type.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_exception_type.cc (revision 69632) @@ -0,0 +1,20 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_exception_type.h" + +namespace Apertium { +basic_ExceptionType::~basic_ExceptionType() throw() {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_exception_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_exception_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_exception_type.h (revision 69632) @@ -0,0 +1,29 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_EXCEPTION_TYPE_H +#define BASIC_EXCEPTION_TYPE_H + +#include + +namespace Apertium { +class basic_ExceptionType : public std::exception { +public: + virtual ~basic_ExceptionType() throw() = 0; + virtual const char *what() const throw() = 0; +}; +} + +#endif // BASIC_EXCEPTION_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.cc (revision 69632) @@ -0,0 +1,125 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_stream_tagger.h" + +#include "apertium_config.h" + +#include "basic_tagger.h" +#include "lexical_unit.h" +#include "stream.h" +#include "streamed_type.h" + +#include + +#if ENABLE_DEBUG + +#include +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +basic_StreamTagger::~basic_StreamTagger() {} + +void basic_StreamTagger::tag(Stream &Input, std::wostream &Output) const { + while (true) { + StreamedType StreamedType_ = Input.get(); + Output << StreamedType_.TheString; + + if (!StreamedType_.TheLexicalUnit) { + if (!Input.flush_()) + break; + + Output << std::flush; + continue; + } + +#if ENABLE_DEBUG + + std::wcerr << L"\n\n"; + +#endif // ENABLE_DEBUG + + tag(*StreamedType_.TheLexicalUnit, Output); + + if (Input.flush_()) + Output << std::flush; + } +} + +basic_StreamTagger::basic_StreamTagger(const basic_Tagger::Flags &Flags_) + : basic_Tagger(Flags_) {} + +void basic_StreamTagger::tag(const LexicalUnit &LexicalUnit_, + std::wostream &Output) const { +#if ENABLE_DEBUG + + for (std::vector::const_iterator Analysis_ = + LexicalUnit_.TheAnalyses.begin(); + Analysis_ != LexicalUnit_.TheAnalyses.end(); ++Analysis_) { + std::wcerr << L"score(\"" << *Analysis_ << L"\") ==\n " + << score_DEBUG(*Analysis_) << L" ==\n " << std::fixed + << std::setprecision(std::numeric_limits::digits10) + << score(*Analysis_) << L"\n"; + } + +#endif // ENABLE_DEBUG + + Output << L"^"; + + if (LexicalUnit_.TheAnalyses.empty()) { + if (TheFlags.getShowSuperficial()) + Output << LexicalUnit_.TheSurfaceForm << L"/"; + + Output << L"*" << LexicalUnit_.TheSurfaceForm << L"$"; + return; + } + + if (TheFlags.getMark()) { + if (LexicalUnit_.TheAnalyses.size() != 1) + Output << L"="; + } + + if (TheFlags.getShowSuperficial()) + Output << LexicalUnit_.TheSurfaceForm << L"/"; + + std::vector::const_iterator TheAnalysis = + LexicalUnit_.TheAnalyses.begin(); + + for (std::vector::const_iterator Analysis_ = + LexicalUnit_.TheAnalyses.begin() + 1; + // Call .end() each 
iteration to save memory. + Analysis_ != LexicalUnit_.TheAnalyses.end(); ++Analysis_) { + if (score(*Analysis_) > score(*TheAnalysis)) + TheAnalysis = Analysis_; + } + + Output << *TheAnalysis; + + if (TheFlags.getFirst()) { + for (std::vector::const_iterator Analysis_ = + LexicalUnit_.TheAnalyses.begin(); + // Call .end() each iteration to save memory. + Analysis_ != LexicalUnit_.TheAnalyses.end(); ++Analysis_) { + if (Analysis_ != TheAnalysis) + Output << L"/" << *Analysis_; + } + } + + Output << L"$"; +} +} Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger.h (revision 69632) @@ -0,0 +1,56 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef BASIC_STREAM_TAGGER_H +#define BASIC_STREAM_TAGGER_H + +#include "apertium_config.h" + +#include "basic_tagger.h" +#include "lexical_unit.h" +#include "stream.h" + +#include +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +class basic_StreamTagger : protected basic_Tagger { +public: + virtual ~basic_StreamTagger(); + virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; + void tag(Stream &Input, std::wostream &Output) const; + +protected: + basic_StreamTagger(const Flags &Flags_); + virtual long double score(const Analysis &Analysis_) const = 0; + +#if ENABLE_DEBUG + + virtual std::wstring score_DEBUG(const Analysis &Analysis_) const = 0; + +#endif // ENABLE_DEBUG + +private: + void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) const; +}; +} + +#endif // BASIC_STREAM_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.cc (revision 69632) @@ -0,0 +1,59 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "basic_stream_tagger_trainer.h" + +#include "analysis.h" +#include "basic_tagger.h" +#include "exception.h" +#include "stream.h" +#include "streamed_type.h" + +namespace Apertium { +basic_StreamTaggerTrainer::~basic_StreamTaggerTrainer() {} + +void basic_StreamTaggerTrainer::train(Stream &TaggedCorpus) { + while (true) { + StreamedType StreamedType_ = TaggedCorpus.get(); + + if (!StreamedType_.TheLexicalUnit) + break; + + if (StreamedType_.TheLexicalUnit->TheAnalyses.empty()) + throw Exception::LexicalUnit::TheAnalyses_empty( + "can't train LexicalUnit comprising empty Analysis std::vector"); + + if (OccurrenceCoefficient % + StreamedType_.TheLexicalUnit->TheAnalyses.size() != + 0) { + OccurrenceCoefficient *= StreamedType_.TheLexicalUnit->TheAnalyses.size(); + multiplyModel(StreamedType_.TheLexicalUnit->TheAnalyses.size()); + } + + for (std::vector::const_iterator Analysis_ = + StreamedType_.TheLexicalUnit->TheAnalyses.begin(); + Analysis_ != StreamedType_.TheLexicalUnit->TheAnalyses.end(); + ++Analysis_) { + train_Analysis(*Analysis_, + OccurrenceCoefficient / + StreamedType_.TheLexicalUnit->TheAnalyses.size()); + } + } +} + +basic_StreamTaggerTrainer::basic_StreamTaggerTrainer( + const basic_Tagger::Flags &Flags_) + : basic_Tagger(Flags_), OccurrenceCoefficient(1) {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_stream_tagger_trainer.h (revision 69632) @@ -0,0 +1,41 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef BASIC_STREAM_TAGGER_TRAINER_H +#define BASIC_STREAM_TAGGER_TRAINER_H + +#include "basic_tagger.h" +#include "stream.h" + +#include + +namespace Apertium { +class basic_StreamTaggerTrainer : protected basic_Tagger { +public: + virtual ~basic_StreamTaggerTrainer(); + void train(Stream &TaggedCorpus); + virtual void serialise(std::ostream &Serialised_basic_Tagger) const = 0; + +protected: + basic_StreamTaggerTrainer(const Flags &Flags_); + virtual void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_) = 0; + virtual void + multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier) = 0; + std::size_t OccurrenceCoefficient; +}; +} + +#endif // BASIC_STREAM_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/basic_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_tagger.cc (revision 69632) @@ -0,0 +1,48 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "basic_tagger.h" + +namespace Apertium { +basic_Tagger::Flags::Flags() + : Debug(false), First(false), Mark(false), ShowSuperficial(false), + NullFlush(false) {} + +bool basic_Tagger::Flags::getDebug() const { return Debug; } + +void basic_Tagger::Flags::setDebug(const bool &Debug_) { Debug = Debug_; } + +bool basic_Tagger::Flags::getFirst() const { return First; } + +void basic_Tagger::Flags::setFirst(const bool &First_) { First = First_; } + +bool basic_Tagger::Flags::getMark() const { return Mark; } + +void basic_Tagger::Flags::setMark(const bool &Mark_) { Mark = Mark_; } + +bool basic_Tagger::Flags::getShowSuperficial() const { return ShowSuperficial; } + +void basic_Tagger::Flags::setShowSuperficial(const bool &ShowSuperficial_) { + ShowSuperficial = ShowSuperficial_; +} + +bool basic_Tagger::Flags::getNullFlush() const { return NullFlush; } + +void basic_Tagger::Flags::setNullFlush(const bool &NullFlush_) { + NullFlush = NullFlush_; +} + +basic_Tagger::basic_Tagger(const Flags &Flags_) : TheFlags(Flags_) {} +} Index: branches/apertium-tagger/apertium2/apertium/basic_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/basic_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/basic_tagger.h (revision 69632) @@ -0,0 +1,60 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef BASIC_TAGGER_H +#define BASIC_TAGGER_H + +namespace Apertium { +class basic_Tagger { +public: + class Flags { + public: + Flags(); + bool getDebug() const; + void setDebug(const bool &Debug_); + bool getFirst() const; + void setFirst(const bool &First_); + bool getMark() const; + void setMark(const bool &Mark_); + bool getShowSuperficial() const; + void setShowSuperficial(const bool &ShowSuperficial_); + bool getNullFlush() const; + void setNullFlush(const bool &NullFlush_); + static bool (Flags::*GetDebug)() const; + static void (Flags::*SetDebug)(const bool &); + static bool (Flags::*GetFirst)() const; + static void (Flags::*SetFirst)(const bool &); + static bool (Flags::*GetMark)() const; + static void (Flags::*SetMark)(const bool &); + static bool (Flags::*GetShowSuperficial)() const; + static void (Flags::*SetShowSuperficial)(const bool &); + static bool (Flags::*GetNullFlush)() const; + static void (Flags::*SetNullFlush)(const bool &); + + private: + bool Debug : 1; + bool First : 1; + bool Mark : 1; + bool ShowSuperficial : 1; + bool NullFlush : 1; + }; + +protected: + basic_Tagger(const Flags &Flags_); + Flags TheFlags; +}; +} + +#endif // BASIC_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/constructor_eq_delete.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/constructor_eq_delete.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/constructor_eq_delete.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at 
your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef CONSTRUCTOR_EQ_DELETE_H +#define CONSTRUCTOR_EQ_DELETE_H + +namespace Apertium { +class constructor_eq_delete { +protected: + constructor_eq_delete() {} + ~constructor_eq_delete() {} + +private: + constructor_eq_delete(const constructor_eq_delete &constructor_eq_delete_); + constructor_eq_delete & + operator=(constructor_eq_delete constructor_eq_delete_); +}; +} + +#endif // CONSTRUCTOR_EQ_DELETE_H Index: branches/apertium-tagger/apertium2/apertium/deserialiser.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/deserialiser.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/deserialiser.h (revision 69632) @@ -0,0 +1,255 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef DESERIALISER_H +#define DESERIALISER_H + +#include "a.h" +#include "analysis.h" +#include "basic_exception_type.h" +#include "exception.h" +#include "i.h" +#include "lemma.h" +#include "morpheme.h" +#include "tag.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +template class Deserialiser; + +template <> class Deserialiser { +public: + inline static a deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static Analysis deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static i deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static Lemma deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static Morpheme deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static Tag deserialise(std::istream &Stream_); +}; + +template +class Deserialiser > { +public: + inline static std::basic_string + deserialise(std::istream &Stream_); +}; + +template +class Deserialiser > { +public: + inline static std::map + deserialise(std::istream &Stream_); +}; + +template +class Deserialiser > { +public: + inline static std::pair + deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static std::size_t deserialise(std::istream &Stream_); +}; + +template class Deserialiser > { +public: + inline static std::vector deserialise(std::istream &Stream_); +}; + +template <> class Deserialiser { +public: + inline static wchar_t deserialise(std::istream &Stream_); +}; + +a Deserialiser::deserialise(std::istream &Stream_) { + a StreamedType_; + StreamedType_.TheTags = Deserialiser >::deserialise(Stream_); + StreamedType_.TheMorphemes = + Deserialiser >::deserialise(Stream_); + return StreamedType_; +} + +Analysis Deserialiser::deserialise(std::istream &Stream_) { + Analysis 
SerialisedType_; + SerialisedType_.TheMorphemes = + Deserialiser >::deserialise(Stream_); + return SerialisedType_; +} + +i Deserialiser::deserialise(std::istream &Stream_) { + i StreamedType_; + StreamedType_.TheTags = Deserialiser >::deserialise(Stream_); + return StreamedType_; +} + +Lemma Deserialiser::deserialise(std::istream &Stream_) { + Lemma StreamedType_; + StreamedType_.TheLemma = Deserialiser::deserialise(Stream_); + return StreamedType_; +} + +Morpheme Deserialiser::deserialise(std::istream &Stream_) { + Morpheme SerialisedType_; + SerialisedType_.TheLemma = Deserialiser::deserialise(Stream_); + SerialisedType_.TheTags = + Deserialiser >::deserialise(Stream_); + return SerialisedType_; +} + +Tag Deserialiser::deserialise(std::istream &Stream_) { + Tag SerialisedType_; + SerialisedType_.TheTag = Deserialiser::deserialise(Stream_); + return SerialisedType_; +} + +template +std::basic_string +Deserialiser >::deserialise( + std::istream &Stream_) { + std::size_t SerialisedValueCount = + Deserialiser::deserialise(Stream_); + std::basic_string SerialisedType_; + + for (; SerialisedValueCount != 0; --SerialisedValueCount) { + SerialisedType_.push_back(Deserialiser::deserialise(Stream_)); + } + + return SerialisedType_; +} + +template +std::map +Deserialiser >::deserialise( + std::istream &Stream_) { + std::size_t SerialisedValueCount = + Deserialiser::deserialise(Stream_); + std::map SerialisedType_; + + for (; SerialisedValueCount != 0; --SerialisedValueCount) { + SerialisedType_.insert( + Deserialiser >::deserialise(Stream_)); + } + + return SerialisedType_; +} + +template +std::pair +Deserialiser >::deserialise( + std::istream &Stream_) { + std::pair SerialisedType_; + SerialisedType_.first = Deserialiser::deserialise(Stream_); + SerialisedType_.second = Deserialiser::deserialise(Stream_); + return SerialisedType_; +} + +std::size_t Deserialiser::deserialise(std::istream &Stream_) { + try { + std::size_t SerialisedType_ = 0; + unsigned char 
SerialisedTypeSize = Stream_.get(); + + if (!Stream_) + throw Exception::Deserialiser::not_Stream_good("can't deserialise size"); + + for (; SerialisedTypeSize != 0;) { + SerialisedType_ += + static_cast(Stream_.get()) + << std::numeric_limits::digits * --SerialisedTypeSize; + + if (!Stream_) + throw Exception::Deserialiser::not_Stream_good( + "can't deserialise byte"); + } + + return SerialisedType_; + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't deserialise std::size_t: " << basic_ExceptionType_.what(); + throw Exception::Deserialiser::size_t_(what_); + } +} + +template +std::vector +Deserialiser >::deserialise(std::istream &Stream_) { + std::size_t SerialisedValueCount = + Deserialiser::deserialise(Stream_); + std::vector SerialisedType_; + + for (; SerialisedValueCount != 0; --SerialisedValueCount) { + SerialisedType_.push_back(Deserialiser::deserialise(Stream_)); + } + + return SerialisedType_; +} + +wchar_t Deserialiser::deserialise(std::istream &Stream_) { + try { + unsigned wchar_t SerialisedType_ = 0; + unsigned char SerialisedTypeSize = Stream_.get(); + + if (!Stream_) + throw Exception::Deserialiser::not_Stream_good("can't deserialise size"); + + for (; SerialisedTypeSize != 0;) { + SerialisedType_ += + static_cast(Stream_.get()) + << std::numeric_limits::digits * --SerialisedTypeSize; + + if (!Stream_) + throw Exception::Deserialiser::not_Stream_good( + "can't deserialise byte"); + } + + return static_cast(SerialisedType_); + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't deserialise wchar_t: " << basic_ExceptionType_.what(); + throw Exception::Deserialiser::wchar_t_(what_); + } +} +} + +#endif // DESERIALISER_H Index: branches/apertium-tagger/apertium2/apertium/err_exception.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/err_exception.h (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/err_exception.h (revision 69632)
@@ -0,0 +1,23 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef ERR_EXCEPTION_H
+#define ERR_EXCEPTION_H
+
+namespace Apertium {
+class err_Exception {}; // Empty type: no members, no base class, no message.
+}
+
+#endif // ERR_EXCEPTION_H
Index: branches/apertium-tagger/apertium2/apertium/exception_type.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/exception_type.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/exception_type.cc (revision 69632)
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+ +#include "exception_type.h" + +#include +#include + +namespace Apertium { +ExceptionType::ExceptionType(const char *const what_) : what_(what_) {} + +ExceptionType::ExceptionType(const std::string &what_) : what_(what_) {} + +ExceptionType::ExceptionType(const std::stringstream &what_) + : what_(what_.str()) {} + +ExceptionType::~ExceptionType() throw() {} + +const char *ExceptionType::what() const throw() { return what_.c_str(); } +} Index: branches/apertium-tagger/apertium2/apertium/exception_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/exception_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/exception_type.h (revision 69632) @@ -0,0 +1,38 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef EXCEPTION_TYPE_H +#define EXCEPTION_TYPE_H + +#include "basic_exception_type.h" + +#include +#include + +namespace Apertium { +class ExceptionType : public basic_ExceptionType { +public: + ExceptionType(const char *const what_); + ExceptionType(const std::string &what_); + ExceptionType(const std::stringstream &what_); + virtual ~ExceptionType() throw() = 0; + const char *what() const throw(); + +protected: + const std::string what_; +}; +} + +#endif // EXCEPTION_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/file_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/file_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/file_tagger.cc (revision 69632) @@ -0,0 +1,42 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+
+#include "file_tagger.h"
+
+#include // NOTE(review): the header name was lost when this diff was extracted; restore it from the repository.
+
+#include // NOTE(review): header name lost in extraction here as well.
+
+namespace Apertium {
+FILE_Tagger::FILE_Tagger() : debug(false), show_sf(false), null_flush(false) {} // All three options start switched off.
+
+FILE_Tagger::~FILE_Tagger() {}
+
+void FILE_Tagger::set_debug(const bool &Debug) { debug = Debug; } // Stores the debug option.
+
+void FILE_Tagger::set_show_sf(const bool &ShowSuperficial) { // Stores the show-superficial-form option.
+  show_sf = ShowSuperficial;
+}
+
+void FILE_Tagger::setNullFlush(const bool &NullFlush) { // Stores the null-flush option.
+  null_flush = NullFlush;
+}
+
+void FILE_Tagger::deserialise(char *const TaggerSpecificationFilename) { // Reads the named file with TSXReader and forwards its tagger data.
+  TSXReader TaggerSpecificationReader_;
+  TaggerSpecificationReader_.read(TaggerSpecificationFilename);
+  deserialise(TaggerSpecificationReader_.getTaggerData()); // Calls another deserialise overload, declared outside this file.
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/i.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/i.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/i.cc (revision 69632)
@@ -0,0 +1,50 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "i.h"
+
+#include "analysis.h"
+#include "exception.h"
+#include "morpheme.h"
+
+namespace Apertium {
+bool operator==(const i &a_, const i &b_) { return a_.TheTags == b_.TheTags; } // Equality is equality of the tag sequences.
+
+bool operator<(const i &a_, const i &b_) { return a_.TheTags < b_.TheTags; } // Ordering is the ordering of the tag sequences.
+
+i::i() {} // Leaves TheTags empty.
+
+i::i(const Analysis &Analysis_) : TheTags() { // Takes the tags of the analysis's first morpheme.
+  if (Analysis_.TheMorphemes.empty()) // front() below needs at least one morpheme.
+    throw Exception::Analysis::TheMorphemes_empty("can't convert const "
+                                                  "Analysis & comprising empty "
+                                                  "Morpheme std::vector to i");
+
+  if (Analysis_.TheMorphemes.front().TheTags.empty()) // A first morpheme without tags is rejected.
+    throw Exception::Morpheme::TheTags_empty("can't convert const Analysis & "
+                                             "comprising Morpheme comprising "
+                                             "empty Tag std::vector to i");
+
+  TheTags = Analysis_.TheMorphemes.front().TheTags;
+}
+
+i::i(const Morpheme &Morpheme_) : TheTags() { // Takes the morpheme's tags.
+  if (Morpheme_.TheTags.empty()) // A morpheme without tags is rejected.
+    throw Exception::Morpheme::TheTags_empty(
+        "can't convert const Morpheme & comprising empty Tag std::vector to i");
+
+  TheTags = Morpheme_.TheTags;
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/i.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/i.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/i.h (revision 69632)
@@ -0,0 +1,38 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+ +#ifndef I_H +#define I_H + +#include "analysis.h" +#include "morpheme.h" +#include "tag.h" + +#include + +namespace Apertium { +class i { + friend bool operator==(const i &a_, const i &b_); + friend bool operator<(const i &a_, const i &b_); + +public: + i(); + i(const Analysis &Analysis_); + i(const Morpheme &Morpheme_); + std::vector TheTags; +}; +} + +#endif // I_H Index: branches/apertium-tagger/apertium2/apertium/lemma.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lemma.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lemma.cc (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+
+#include "lemma.h"
+
+#include "analysis.h"
+#include "exception.h"
+#include "morpheme.h"
+
+namespace Apertium {
+bool operator==(const Lemma &a_, const Lemma &b_) { // Equality is equality of the lemma strings.
+  return a_.TheLemma == b_.TheLemma;
+}
+
+bool operator<(const Lemma &a_, const Lemma &b_) { // Ordering is the ordering of the lemma strings.
+  return a_.TheLemma < b_.TheLemma;
+}
+
+Lemma::Lemma() : TheLemma() {} // Starts with an empty lemma string.
+
+Lemma::Lemma(const Analysis &Analysis_) : TheLemma() { // Takes the lemma of the analysis's first morpheme.
+  if (Analysis_.TheMorphemes.empty()) // front() below needs at least one morpheme.
+    throw Exception::Analysis::TheMorphemes_empty(
+        "can't convert const Analysis & comprising empty Morpheme std::vector "
+        "to Lemma");
+
+  if (Analysis_.TheMorphemes.front().TheLemma.empty()) // A first morpheme with an empty lemma is rejected.
+    throw Exception::Morpheme::TheLemma_empty(
+        "can't convert const Analysis & comprising Morpheme comprising empty "
+        "Lemma std::wstring to Lemma");
+
+  TheLemma = Analysis_.TheMorphemes.front().TheLemma;
+}
+
+Lemma::Lemma(const Morpheme &Morpheme_) : TheLemma() { // Takes the morpheme's lemma.
+  if (Morpheme_.TheLemma.empty()) // A morpheme with an empty lemma is rejected.
+    throw Exception::Morpheme::TheLemma_empty("can't convert const Morpheme & "
+                                              "comprising empty Lemma "
+                                              "std::wstring to Lemma");
+
+  TheLemma = Morpheme_.TheLemma;
+}
+}
Index: branches/apertium-tagger/apertium2/apertium/lemma.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/lemma.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/lemma.h (revision 69632)
@@ -0,0 +1,36 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef LEMMA_H +#define LEMMA_H + +#include "analysis.h" +#include "morpheme.h" + +#include + +namespace Apertium { +class Lemma { +public: + friend bool operator==(const Lemma &a_, const Lemma &b_); + friend bool operator<(const Lemma &a_, const Lemma &b_); + Lemma(); + Lemma(const Analysis &Analysis_); + Lemma(const Morpheme &Morpheme_); + std::wstring TheLemma; +}; +} + +#endif // LEMMA_H Index: branches/apertium-tagger/apertium2/apertium/lexical_unit.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/lexical_unit.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lexical_unit.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef TAGGING_EXPRESSION_H +#define TAGGING_EXPRESSION_H + +#include "analysis.h" + +#include +#include + +namespace Apertium { +class LexicalUnit { +public: + std::wstring TheSurfaceForm; + std::vector TheAnalyses; +}; +} + +#endif // LEXICAL_UNIT_H Index: branches/apertium-tagger/apertium2/apertium/linebreak.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/linebreak.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/linebreak.cc (revision 69632) @@ -0,0 +1,94 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "linebreak.h" + +#include + +namespace Apertium { +std::string linebreak::linebreak_(std::string string_, + std::string::size_type col, + const std::string::size_type &wrapmargin) { + std::string::size_type i_ = 0; + + while (true) { + if (i_ == string_.size()) + return string_; + + if (col < 79) { + if (string_.at(i_) == '\n') { + if (i_ + 1 == string_.size()) { + string_.erase(i_, 1); + return string_; + } + + string_.insert(i_ + 1, wrapmargin, ' '); + col = wrapmargin; + i_ += wrapmargin; + continue; + } + + ++col; + ++i_; + continue; + } + + if (string_.at(i_) == ' ') { + std::string::size_type j_ = i_ + 1; + + for (; i_ != 0; --i_) { + if (string_.at(i_ - 1) != ' ') + break; + } + + for (;; ++j_) { + if (j_ == string_.size()) { + string_.erase(i_, j_ - i_); + return string_; + } + + if (string_.at(j_) != ' ') + break; + } + + linebreak_(string_, col, wrapmargin, i_, j_); + continue; + } + + std::string::size_type j_ = i_; + + for (; j_ != 0; --j_) { + if (string_.at(j_ - 1) == ' ') + break; + } + + for (i_ = j_; i_ != 0; --i_) { + if (string_.at(i_ - 1) != ' ') + break; + } + + linebreak_(string_, col, wrapmargin, i_, j_); + } +} + +void linebreak::linebreak_(std::string &string_, std::string::size_type &col, + const std::string::size_type &wrapmargin, + std::string::size_type &i_, + const std::string::size_type &j_) { + string_.replace(i_, j_ - i_, '\n' + std::string(wrapmargin, ' ')); + col = wrapmargin; + i_ += 1 /* '\n' */ + wrapmargin /* std::string(wrapmargin, ' ') */; +} +} Index: branches/apertium-tagger/apertium2/apertium/linebreak.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/linebreak.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/linebreak.h (revision 69632) @@ -0,0 +1,36 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the 
GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef LINEBREAK_H +#define LINEBREAK_H + +#include + +namespace Apertium { +class linebreak { +public: + static std::string linebreak_(std::string string_, + std::string::size_type col, + const std::string::size_type &wrapmargin); + +private: + static void linebreak_(std::string &string_, std::string::size_type &col, + const std::string::size_type &wrapmargin, + std::string::size_type &i_, + const std::string::size_type &j_); +}; +} + +#endif // LINEBREAK_H Index: branches/apertium-tagger/apertium2/apertium/morpheme.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/morpheme.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/morpheme.cc (revision 69632) @@ -0,0 +1,57 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "morpheme.h" + +#include "exception.h" +#include "tag.h" + +#include +#include + +namespace Apertium { +bool operator==(const Morpheme &a, const Morpheme &b) { + return a.TheLemma == b.TheLemma && a.TheTags == b.TheTags; +} + +bool operator<(const Morpheme &a, const Morpheme &b) { + if (a.TheLemma != b.TheLemma) + return a.TheLemma < b.TheLemma; + + return a.TheTags < b.TheTags; +} + +Morpheme::operator std::wstring() const { + if (TheTags.empty()) + throw Exception::Morpheme::TheTags_empty("can't convert Morpheme " + "comprising empty Tag std::vector " + "to std::wstring"); + + if (TheLemma.empty()) + throw Exception::Morpheme::TheLemma_empty("can't convert Morpheme " + "comprising empty TheLemma " + "std::wstring to std::wstring"); + + std::wstring wstring_ = TheLemma; + + for (std::vector::const_iterator Tag_ = TheTags.begin(); + // Call .end() each iteration to save memory. + Tag_ != TheTags.end(); ++Tag_) { + wstring_ += static_cast(*Tag_); + } + + return wstring_; +} +} Index: branches/apertium-tagger/apertium2/apertium/morpheme.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/morpheme.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/morpheme.h (revision 69632) @@ -0,0 +1,35 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef MORPHEME_H +#define MORPHEME_H + +#include "tag.h" + +#include +#include + +namespace Apertium { +class Morpheme { +public: + friend bool operator==(const Morpheme &a, const Morpheme &b); + friend bool operator<(const Morpheme &a, const Morpheme &b); + operator std::wstring() const; + std::wstring TheLemma; + std::vector TheTags; +}; +} + +#endif // MORPHEME_H Index: branches/apertium-tagger/apertium2/apertium/optional.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/optional.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/optional.h (revision 69632) @@ -0,0 +1,123 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef OPTIONAL_H +#define OPTIONAL_H + +#include "exception.h" + +#include +#include +#include +#include + +namespace Apertium { +template class Optional; + +template +void swap(Optional &A, Optional &B); + +template class Optional { +public: + friend void swap(Optional &A, Optional &B); + Optional(); + Optional(const OptionalType &OptionalType_); + Optional(const Optional &Optional_); + Optional &operator=(Optional Optional_); + ~Optional(); + const OptionalType &operator*() const; + OptionalType &operator*(); + const OptionalType *operator->() const; + OptionalType *operator->(); + operator bool() const; + +private: + OptionalType *TheOptionalTypePointer; +}; + +template +void swap(Optional &A, Optional &B) { + using std::swap; + swap(A.TheOptionalTypePointer, B.TheOptionalTypePointer); +} + +template +Optional::Optional() + : TheOptionalTypePointer(NULL) {} + +template +Optional::Optional(const OptionalType &OptionalType_) + : TheOptionalTypePointer(new OptionalType(OptionalType_)) {} + +template +Optional::Optional(const Optional &Optional_) { + if (Optional_.TheOptionalTypePointer == NULL) { + TheOptionalTypePointer = NULL; + return; + } + + TheOptionalTypePointer = + new OptionalType(*(Optional_.TheOptionalTypePointer)); +} + +template +Optional &Optional::operator=(Optional Optional_) { + swap(*this, Optional_); + return *this; +} + +template Optional::~Optional() { + if (TheOptionalTypePointer == NULL) + return; + + delete TheOptionalTypePointer; +} + +template +const OptionalType &Optional::operator*() const { + if (TheOptionalTypePointer == NULL) + throw Exception::Optional::TheOptionalTypePointer_null( + "can't dereference Optional comprising null OptionalType pointer"); + + return *TheOptionalTypePointer; +} + +template +OptionalType &Optional::operator*() { + return const_cast( + static_cast(*this).operator*()); +} + +template +const OptionalType *Optional::operator->() const { + if (TheOptionalTypePointer == NULL) + throw 
Exception::Optional::TheOptionalTypePointer_null( + "can't dereference Optional comprising null OptionalType pointer"); + + return TheOptionalTypePointer; +} + +template +OptionalType *Optional::operator->() { + return const_cast( + static_cast(*this).operator->()); +} + +template Optional::operator bool() const { + return TheOptionalTypePointer != NULL; +} +} + +#endif Index: branches/apertium-tagger/apertium2/apertium/stream.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream.cc (revision 69632) @@ -0,0 +1,774 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "stream.h" + +#include "analysis.h" +#include "basic_tagger.h" +#include "streamed_type.h" +#include "wchar_t_exception.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +Stream::Stream(const basic_Tagger::Flags &Flags_) + : TheCharacterStream(std::wcin), TheFilename(), TheLineNumber(1), TheLine(), + TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} + +Stream::Stream(const basic_Tagger::Flags &Flags_, + std::wifstream &CharacterStream_, const char *const Filename_) + : TheCharacterStream(CharacterStream_), TheFilename(Filename_), + TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), + ThePreviousCase() {} + +Stream::Stream(const basic_Tagger::Flags &Flags_, + std::wifstream &CharacterStream_, const std::string &Filename_) + : TheCharacterStream(CharacterStream_), TheFilename(Filename_), + TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), + ThePreviousCase() {} + +Stream::Stream(const basic_Tagger::Flags &Flags_, + std::wifstream &CharacterStream_, + const std::stringstream &Filename_) + : TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()), + TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), + ThePreviousCase() {} + +StreamedType Stream::get() { + StreamedType TheStreamedType; + std::wstring Lemma; + private_flush_ = false; + + if (!is_eof_throw_if_not_TheCharacterStream_good()) { + while (true) { + const wchar_t Character_ = TheCharacterStream.get(); + + if (is_eof_throw_if_not_TheCharacterStream_good(TheStreamedType, Lemma, + Character_)) + break; + + TheLine.push_back(Character_); + + switch (Character_) { + case L'\\': // <\> 92, Hex 5c, Octal 134 + case_0x5c(TheStreamedType, Lemma, Character_); + continue; + case L'[': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ 
<< L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '[' expected to follow ']' or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + push_back_Character(TheStreamedType, Lemma, Character_); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L']': + if (!ThePreviousCase) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"', ']' expected to follow '['"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + ThePreviousCase = PreviousCaseType(Character_); + continue; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', ']' expected to follow '['"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + std::abort(); + case L'^': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '^' expected to follow '[', ']', or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + TheStreamedType.TheLexicalUnit = LexicalUnit(); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'/': + if (!ThePreviousCase) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"', '/' expected to follow '[', to follow '>' immediately, " + L"or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + 
push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'^': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + + { + const wchar_t Character_ = TheCharacterStream.get(); + + if (is_eof_throw_if_not_TheCharacterStream_good( + TheStreamedType, Lemma, Character_)) { + std::wstringstream Message; + Message << L"unexpected end-of-file following '" + << ThePreviousCase->ThePreviousCase + << "', end-of-file expected to follow ']' or '$'"; + throw wchar_t_Exception::Stream::UnexpectedEndOfFile( + Message_what(Message)); + } + + TheLine.push_back(Character_); + + switch (Character_) { + case L'\\': + TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + case_0x5c(TheStreamedType, Lemma, Character_); + continue; + case L'*': + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'\n': { + std::wstringstream Message; + Message << L"unexpected newline following '" + << ThePreviousCase->ThePreviousCase + << "', newline expected to follow '[', ']', or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCharacter( + Message_what(Message)); + }; + case L'[': + case L']': + case L'^': + case L'#': + case L'<': + case L'>': + case L'+': + case L'$': { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase << L"', expected '*'"; + throw wchar_t_Exception::Stream::UnexpectedPreviousCase( + Message_what(Message)); + } + default: + 
TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + } + } + + continue; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'#': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '/' expected to follow '[', to follow '>' " + L"immediately, or to follow '^' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis()); + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'*': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '*' expected to follow '[', ']', or '$' or to follow " + L"'/' immediately"; 
+ throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'<': + if (!ThePreviousCase) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'/': + case L'#': + case L'+': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '<' expected to follow '[', to follow '>' immediately, " + L"or to follow '#', '/' or '+' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheTags.push_back(Tag()); + ThePreviousCase = PreviousCaseType(Character_); + continue; + case L'>': + if 
(!ThePreviousCase) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"', '>' expected to " + L"follow '[' or to follow " + L"'<' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'<': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '>' expected to " + L"follow '[' or to follow " + L"'<' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + continue; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '>' expected to " + L"follow '[' or to follow " + L"'<' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + std::abort(); + case L'#': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'/': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '#' expected to follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '/' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '#' expected to 
follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '/' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '#' expected to follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '/' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + continue; + } + + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'+': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '+' expected to follow '[', ']', or '$', to follow " + L"'>' " + L"immediately, or to follow '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'#': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '+' expected to follow '[', ']', or '$', to follow " + L"'>' " + L"immediately, or to follow '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: { + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '+' expected to follow '[', ']', or '$', to follow " + L"'>' immediately, or to follow '#' not immediately"; + throw 
wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + TheStreamedType.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.push_back(Morpheme()); + ThePreviousCase = PreviousCaseType(Character_); + continue; + } + + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'$': + if (!ThePreviousCase) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + case L'*': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + if (TheFlags.getDebug()) { + if (Lemma != TheStreamedType.TheLexicalUnit->TheSurfaceForm) + std::wcerr << L"unexpected lemma \"" << Lemma + << L"\", expected \"" + << TheStreamedType.TheLexicalUnit->TheSurfaceForm + << L"\"\n"; + } + + ThePreviousCase = PreviousCaseType(Character_); + return TheStreamedType; + case L'>': + if (!ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" << Character_ + << L"' not immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + case L'#': + if (ThePreviousCase->isPreviousCharacter) { + std::wstringstream Message; + Message + << L"unexpected '" 
<< Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + break; + default: + std::wstringstream Message; + Message + << L"unexpected '" << Character_ << L"' following '" + << ThePreviousCase->ThePreviousCase + << L"', '$' expected to follow '[', to follow '>' immediately, " + L"or to follow '*' or '#' not immediately"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + + ThePreviousCase = PreviousCaseType(Character_); + return TheStreamedType; + case L'\n': + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected newline following '" + << ThePreviousCase->ThePreviousCase + << L"', newline expected to follow '[', ']', or '$'"; + throw wchar_t_Exception::Stream::UnexpectedCase( + Message_what(Message)); + } + } + + push_back_Character(TheStreamedType, Lemma, Character_); + ++TheLineNumber; + TheLine.clear(); + continue; + default: + push_back_Character(TheStreamedType, Lemma, Character_); + continue; + } + + std::abort(); + } + } + + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L']': + case L'$': + break; + default: + std::wstringstream Message; + Message << L"unexpected end-of-file following '" + << ThePreviousCase->ThePreviousCase + << L"', end-of-file expected to follow ']' " + L"or '$'"; + throw wchar_t_Exception::Stream::UnexpectedEndOfFile( + Message_what(Message)); + } + } + + return TheStreamedType; +} + +bool Stream::flush_() const { return private_flush_; } + +Stream::PreviousCaseType::PreviousCaseType(const wchar_t &PreviousCase_) + : ThePreviousCase(PreviousCase_), isPreviousCharacter(true) {} + +bool 
Stream::is_eof_throw_if_not_TheCharacterStream_good() const { + if (TheCharacterStream.eof()) + return true; + + if (!TheCharacterStream) { + std::wstringstream Message; + Message << L"can't get const wchar_t: TheCharacterStream not good"; + throw wchar_t_Exception::Stream::TheCharacterStream_not_good( + Message_what(Message)); + } + + return false; +} + +std::wstring Stream::Message_what(const std::wstringstream &Message) const { + std::wstringstream what_; + + if (TheFilename) + what_ << std::wstring(TheFilename->begin(), TheFilename->end()) << L": "; + + what_ << TheLineNumber << L":" << TheLine.size() << L": " << Message.str() + << L'\n' << TheLine << L'\n' << std::wstring(TheLine.size() - 1, L' ') + << L'^'; + return what_.str(); +} + +bool +Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_) { + if (isTheCharacterStream_eof(StreamedType_, Lemma, Character_)) + return true; + + if (!TheCharacterStream) { + std::wstringstream Message; + Message << L"can't get const wchar_t: TheCharacterStream not good"; + throw wchar_t_Exception::Stream::TheCharacterStream_not_good( + Message_what(Message)); + } + + return false; +} + +bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_) { + if (TheCharacterStream.eof()) + return true; + + if (TheFlags.getNullFlush()) { + if (Character_ == L'\0') { + push_back_Character(StreamedType_, Lemma, Character_); + private_flush_ = true; + return true; + } + } + + return false; +} + +void Stream::push_back_Character(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_) { + if (ThePreviousCase) { + switch (ThePreviousCase->ThePreviousCase) { + case L'[': + StreamedType_.TheString += Character_; + break; + case L']': + StreamedType_.TheString += Character_; + break; + case L'^': + StreamedType_.TheLexicalUnit->TheSurfaceForm += Character_; + break; + case L'/': + 
StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheLemma.push_back(Character_); + break; + case L'*': + Lemma += Character_; + break; + case L'<': + StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheTags.back() + .TheTag += Character_; + break; + case L'>': { + std::wstringstream Message; + Message << L"unexpected '" << Character_ << L"' immediately following '" + << ThePreviousCase->ThePreviousCase << L"'"; + throw wchar_t_Exception::Stream::UnexpectedCharacter( + Message_what(Message)); + } + case L'#': + StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheLemma.push_back(Character_); + break; + case L'+': + StreamedType_.TheLexicalUnit->TheAnalyses.back() + .TheMorphemes.back() + .TheLemma.push_back(Character_); + break; + case L'$': + StreamedType_.TheString += Character_; + break; + default: + std::wstringstream Message; + Message << L"unexpected previous reserved or special character '" + << ThePreviousCase->ThePreviousCase << L"'"; + throw wchar_t_Exception::Stream::UnexpectedPreviousCase( + Message_what(Message)); + } + + ThePreviousCase->isPreviousCharacter = false; + return; + } + + StreamedType_.TheString += Character_; +} + +void Stream::case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, + const wchar_t &Character_) { + push_back_Character(StreamedType_, Lemma, Character_); + + { + const wchar_t Character_ = TheCharacterStream.get(); + + if (is_eof_throw_if_not_TheCharacterStream_good(StreamedType_, Lemma, + Character_)) { + std::wstringstream Message; + Message << L"unexpected end-of-file following '\\', end-of-file " + L"expected to follow ']' or '$'"; + throw wchar_t_Exception::Stream::UnexpectedEndOfFile( + Message_what(Message)); + } + + TheLine.push_back(Character_); + push_back_Character(StreamedType_, Lemma, Character_); + } +} +} Index: branches/apertium-tagger/apertium2/apertium/stream.h =================================================================== 
--- branches/apertium-tagger/apertium2/apertium/stream.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream.h (revision 69632) @@ -0,0 +1,69 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_H +#define STREAM_H + +#include "basic_tagger.h" +#include "optional.h" +#include "streamed_type.h" + +#include +#include +#include +#include + +namespace Apertium { +class Stream { +public: + Stream(const basic_Tagger::Flags &Flags_); + Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + const char *const Filename_); + Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + const std::string &Filename_); + Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + const std::stringstream &Filename_); + StreamedType get(); + bool flush_() const; + +private: + class PreviousCaseType { + public: + PreviousCaseType(const wchar_t &PreviousCase_); + wchar_t ThePreviousCase; + bool isPreviousCharacter : 1; + }; + bool is_eof_throw_if_not_TheCharacterStream_good() const; + std::wstring Message_what(const std::wstringstream &Message) const; + bool is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_, + std::wstring &Lemma, + const wchar_t &Character_); + bool isTheCharacterStream_eof(StreamedType &StreamedType_, + 
std::wstring &Lemma, const wchar_t &Character_); + void push_back_Character(StreamedType &StreamedType_, std::wstring &Lemma, + const wchar_t &Character_); + void case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma, + const wchar_t &Character_); + std::wistream &TheCharacterStream; + Optional TheFilename; + std::size_t TheLineNumber; + std::wstring TheLine; + const basic_Tagger::Flags &TheFlags; + bool private_flush_ : 1; + Optional ThePreviousCase; +}; +} + +#endif // STREAM_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.cc (revision 69632) @@ -0,0 +1,68 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#include "stream_5_3_1_tagger.h" + +#include "apertium_config.h" + +#include "analysis.h" +#include "deserialiser.h" +#include "lexical_unit.h" +#include "stream.h" +#include "streamed_type.h" + +#include +#include +#include +#include + +#if ENABLE_DEBUG + +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +Stream_5_3_1_Tagger::Stream_5_3_1_Tagger(const Flags &Flags_) + : basic_5_3_1_Tagger(), basic_StreamTagger(Flags_) {} + +void Stream_5_3_1_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { + Model = Deserialiser >::deserialise( + Serialised_basic_Tagger); +} + +long double Stream_5_3_1_Tagger::score(const Analysis &Analysis_) const { + return tokenCount_T(Analysis_); +} + +long double Stream_5_3_1_Tagger::tokenCount_T(const Analysis &Analysis_) const { + if (Model.find(Analysis_) == Model.end()) + return 1; + + return 1 + Model.find(Analysis_)->second; +} + +#if ENABLE_DEBUG + +std::wstring Stream_5_3_1_Tagger::score_DEBUG(const Analysis &Analysis_) const { + std::wstringstream score_DEBUG_; + score_DEBUG_ << tokenCount_T(Analysis_); + return score_DEBUG_.str(); +} + +#endif // ENABLE_DEBUG + +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger.h (revision 69632) @@ -0,0 +1,53 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_5_3_1_TAGGER_H +#define STREAM_5_3_1_TAGGER_H + +#include "apertium_config.h" + +#include "analysis.h" +#include "basic_5_3_1_tagger.h" +#include "basic_stream_tagger.h" + +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +class Stream_5_3_1_Tagger : private basic_5_3_1_Tagger, + public basic_StreamTagger { +public: + Stream_5_3_1_Tagger(const Flags &Flags_); + void deserialise(std::istream &Serialised_basic_Tagger); + +private: + long double score(const Analysis &Analysis_) const; + long double tokenCount_T(const Analysis &Analysis_) const; + +#if ENABLE_DEBUG + + std::wstring score_DEBUG(const Analysis &Analysis_) const; + +#endif // ENABLE_DEBUG + +}; +} + +#endif // STREAM_5_3_1_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_1_tagger_trainer.h (revision 69632) @@ -0,0 +1,41 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#ifndef STREAM_5_3_1_TAGGER_TRAINER_H +#define STREAM_5_3_1_TAGGER_TRAINER_H + +#include "basic_5_3_1_tagger.h" +#include "basic_stream_tagger_trainer.h" + +#include "analysis.h" +#include "stream.h" + +#include + +namespace Apertium { +class Stream_5_3_1_TaggerTrainer : private basic_5_3_1_Tagger, + public basic_StreamTaggerTrainer { +public: + Stream_5_3_1_TaggerTrainer(const Flags &Flags_); + void serialise(std::ostream &Serialised_basic_Tagger) const; + +private: + void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_); + void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); +}; +} + +#endif // STREAM_5_3_1_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.cc (revision 69632) @@ -0,0 +1,104 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include "stream_5_3_2_tagger.h" + +#include "apertium_config.h" + +#include "a.h" +#include "analysis.h" +#include "deserialiser.h" +#include "lemma.h" + +#include +#include +#include + +#if ENABLE_DEBUG + +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +Stream_5_3_2_Tagger::Stream_5_3_2_Tagger(const Flags &Flags_) + : basic_5_3_2_Tagger(), basic_StreamTagger(Flags_) {} + +void Stream_5_3_2_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { + Model = + Deserialiser > >::deserialise( + Serialised_basic_Tagger); +} + +long double Stream_5_3_2_Tagger::score(const Analysis &Analysis_) const { + return (tokenCount_r_a(Analysis_) * tokenCount_a(Analysis_)) / + (tokenCount_a(Analysis_) + typeCount_a(Analysis_)); +} + +long double +Stream_5_3_2_Tagger::tokenCount_r_a(const Analysis &Analysis_) const { + if (Model.find(a(Analysis_)) == Model.end()) + return 1; + + if (Model.find(a(Analysis_))->second.find(Lemma(Analysis_)) == + Model.find(a(Analysis_))->second.end()) + return 1; + + return 1 + Model.find(a(Analysis_))->second.find(Lemma(Analysis_))->second; +} + +long double Stream_5_3_2_Tagger::tokenCount_a(const Analysis &Analysis_) const { + if (Model.find(a(Analysis_)) == Model.end()) + return 1; + + long double tokenCount_a_ = 1; + + for (std::map::const_iterator Lemma_ = + Model.find(a(Analysis_))->second.begin(); + Lemma_ != Model.find(a(Analysis_))->second.end(); ++Lemma_) { + tokenCount_a_ += Lemma_->second; + } + + return tokenCount_a_; +} + +long double Stream_5_3_2_Tagger::typeCount_a(const Analysis &Analysis_) const { + if (Model.find(a(Analysis_)) == Model.end()) + return 1; + + return (Model.find(a(Analysis_))->second.find(Lemma(Analysis_)) == + Model.find(a(Analysis_))->second.end() + ? 
1 + : 0) + + Model.find(a(Analysis_))->second.size(); +} + +#if ENABLE_DEBUG + +std::wstring Stream_5_3_2_Tagger::score_DEBUG(const Analysis &Analysis_) const { + std::wstringstream score_DEBUG_; + + score_DEBUG_ << L"(" << tokenCount_r_a(Analysis_) << L" * " + << tokenCount_a(Analysis_) << L") /\n (" + << tokenCount_a(Analysis_) << L" + " << typeCount_a(Analysis_) + << L")"; + + return score_DEBUG_.str(); +} + +#endif // ENABLE_DEBUG + +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger.h (revision 69632) @@ -0,0 +1,55 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef STREAM_5_3_2_TAGGER_H +#define STREAM_5_3_2_TAGGER_H + +#include "apertium_config.h" + +#include "analysis.h" +#include "basic_5_3_2_tagger.h" +#include "basic_stream_tagger.h" + +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +// Stream tagger for model "5.3.2". It inherits privately from +// basic_5_3_2_Tagger and publicly from basic_StreamTagger; deserialise() +// loads the model and the private members compute an analysis score from it. +class Stream_5_3_2_Tagger : private basic_5_3_2_Tagger, + public basic_StreamTagger { +public: + Stream_5_3_2_Tagger(const Flags &Flags_); + // Replaces the model with one read from Serialised_basic_Tagger. + void deserialise(std::istream &Serialised_basic_Tagger); + +private: + // Score of an analysis, followed by the three count terms it is built from. + long double score(const Analysis &Analysis_) const; + long double tokenCount_r_a(const Analysis &Analysis_) const; + long double tokenCount_a(const Analysis &Analysis_) const; + long double typeCount_a(const Analysis &Analysis_) const; + +#if ENABLE_DEBUG + + // Textual rendering of the score expression; only built when ENABLE_DEBUG + // is set. + std::wstring score_DEBUG(const Analysis &Analysis_) const; + +#endif // ENABLE_DEBUG + +}; +} + +#endif // STREAM_5_3_2_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_2_tagger_trainer.h (revision 69632) @@ -0,0 +1,38 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef STREAM_5_3_2_TAGGER_TRAINER_H +#define STREAM_5_3_2_TAGGER_TRAINER_H + +#include "basic_5_3_2_tagger.h" +#include "basic_stream_tagger_trainer.h" + +#include + +namespace Apertium { +// Trainer interface for model "5.3.2". It inherits privately from +// basic_5_3_2_Tagger and publicly from basic_StreamTaggerTrainer. Only +// declarations live here; the member definitions are in a separate source +// file. +class Stream_5_3_2_TaggerTrainer : private basic_5_3_2_Tagger, + public basic_StreamTaggerTrainer { +public: + Stream_5_3_2_TaggerTrainer(const Flags &Flags_); + // Presumably writes the trained model to Serialised_basic_Tagger -- confirm + // against the definition. + void serialise(std::ostream &Serialised_basic_Tagger) const; + +private: + // Private training hooks; NOTE(review): presumably invoked by the + // basic_StreamTaggerTrainer base -- confirm. + void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_); + void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); +}; +} + +#endif // STREAM_5_3_2_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.cc (revision 69632) @@ -0,0 +1,223 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ +#include "stream_5_3_3_tagger.h" + +#include "apertium_config.h" + +#include "analysis.h" +#include "deserialiser.h" +#include "i.h" +#include "lemma.h" +#include "morpheme.h" + +#include + +#if ENABLE_DEBUG + +#include +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +// Constructs the tagger and forwards Flags_ to the basic_StreamTagger base. +Stream_5_3_3_Tagger::Stream_5_3_3_Tagger(const Flags &Flags_) + : basic_StreamTagger(Flags_) {} + +// Replaces Model with the value deserialised from Serialised_basic_Tagger. +// Model is a pair: .first is keyed by i then Lemma, .second.first is keyed by +// i then Lemma, and .second.second is keyed by Lemma then i (see the lookups +// below). +// NOTE(review): the template arguments are truncated in this copy -- restore +// them from the upstream file. +void Stream_5_3_3_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { + Model = Deserialiser< + std::pair >, + std::pair >, + std::map > > > >:: + deserialise(Serialised_basic_Tagger); +} + +// Scores Analysis_ as a quotient. The numerator starts as +// tokenCount_r_i * tokenCount_i and the divisor as tokenCount_i + typeCount_i; +// every morpheme after the first multiplies both by its own terms, each +// pairing a morpheme with its predecessor. +// NOTE(review): the loop starts at begin() + 1, which assumes TheMorphemes is +// non-empty -- confirm that callers guarantee this. +long double Stream_5_3_3_Tagger::score(const Analysis &Analysis_) const { + long double score = tokenCount_r_i(Analysis_) * tokenCount_i(Analysis_), + score_Divisor = tokenCount_i(Analysis_) + typeCount_i(Analysis_); + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + score *= tokenCount_d_i_Morpheme(Lemma(*Morpheme_), i(*(Morpheme_ - 1))) * + tokenCount_i_d_Morpheme(i(*Morpheme_), Lemma(*Morpheme_)); + score_Divisor *= + (tokenCount_i_Morpheme(i(*(Morpheme_ - 1))) + + typeCount_i_Morpheme(i(*(Morpheme_ - 1)), Lemma(*Morpheme_))) * + (tokenCount_d_Morpheme(Lemma(*Morpheme_)) + + typeCount_d_Morpheme(Lemma(*Morpheme_), i(*Morpheme_))); + } + + return score / score_Divisor; +} + +// Returns 1 plus the count at Model.first[i(Analysis_)][Lemma(Analysis_)], +// or just 1 when either key is absent. +long double +Stream_5_3_3_Tagger::tokenCount_r_i(const Analysis &Analysis_) const { + if (Model.first.find(i(Analysis_)) == Model.first.end()) + return 1; + + if (Model.first.find(i(Analysis_))->second.find(Lemma(Analysis_)) == + Model.first.find(i(Analysis_))->second.end()) + return 1; + + return 1 + + Model.first.find(i(Analysis_))->second.find(Lemma(Analysis_))->second; +} + +// Returns 1 plus the sum of every count under Model.first[i(Analysis_)], +// or just 1 when that key is absent. +long double Stream_5_3_3_Tagger::tokenCount_i(const Analysis &Analysis_) const { + if (Model.first.find(i(Analysis_)) == Model.first.end()) + return 1; + + long double tokenCount_i_ = 1; + + for (std::map::const_iterator Lemma_ = + 
Model.first.find(i(Analysis_))->second.begin(); + Lemma_ != Model.first.find(i(Analysis_))->second.end(); ++Lemma_) { + tokenCount_i_ += Lemma_->second; + } + + return tokenCount_i_; +} + +// Returns the number of entries under Model.first[i(Analysis_)], plus 1 when +// Lemma(Analysis_) is not one of them; returns 1 when the key is absent. +long double Stream_5_3_3_Tagger::typeCount_i(const Analysis &Analysis_) const { + if (Model.first.find(i(Analysis_)) == Model.first.end()) + return 1; + + return (Model.first.find(i(Analysis_))->second.find(Lemma(Analysis_)) == + Model.first.find(i(Analysis_))->second.end() + ? 1 + : 0) + + Model.first.find(i(Analysis_))->second.size(); +} + +// Returns 1 plus the count at Model.second.first[i_][Lemma_], or just 1 when +// either key is absent. +long double Stream_5_3_3_Tagger::tokenCount_d_i_Morpheme(const Lemma &Lemma_, + const i &i_) const { + if (Model.second.first.find(i_) == Model.second.first.end()) + return 1; + + if (Model.second.first.find(i_)->second.find(Lemma_) == + Model.second.first.find(i_)->second.end()) + return 1; + + return 1 + Model.second.first.find(i_)->second.find(Lemma_)->second; +} + +// Returns 1 plus the count at Model.second.second[Lemma_][i_], or just 1 when +// either key is absent. +long double +Stream_5_3_3_Tagger::tokenCount_i_d_Morpheme(const i &i_, + const Lemma &Lemma_) const { + if (Model.second.second.find(Lemma_) == Model.second.second.end()) + return 1; + + if (Model.second.second.find(Lemma_)->second.find(i_) == + Model.second.second.find(Lemma_)->second.end()) + return 1; + + return 1 + Model.second.second.find(Lemma_)->second.find(i_)->second; +} + +// Returns 1 plus the sum of every count under Model.second.first[i_], or just +// 1 when that key is absent. +long double Stream_5_3_3_Tagger::tokenCount_i_Morpheme(const i &i_) const { + if (Model.second.first.find(i_) == Model.second.first.end()) + return 1; + + // Named after the function, like the accumulators in the sibling + // tokenCount_* members; this is a token total, not a type count. + long double tokenCount_i_Morpheme_ = 1; + + for (std::map::const_iterator Lemma_ = + Model.second.first.find(i_)->second.begin(); + Lemma_ != Model.second.first.find(i_)->second.end(); ++Lemma_) { + tokenCount_i_Morpheme_ += Lemma_->second; + } + + return tokenCount_i_Morpheme_; +} + +// Returns the number of entries under Model.second.first[i_], plus 1 when +// Lemma_ is not one of them; returns 1 when i_ is absent. +long double +Stream_5_3_3_Tagger::typeCount_i_Morpheme(const i &i_, + const Lemma &Lemma_) const { + if (Model.second.first.find(i_) == Model.second.first.end()) + return 1; + + return (Model.second.first.find(i_)->second.find(Lemma_) == + 
Model.second.first.find(i_)->second.end() + ? 1 + : 0) + + Model.second.first.find(i_)->second.size(); +} + +// Returns 1 plus the sum of every count under Model.second.second[Lemma_], or +// just 1 when that key is absent. +long double +Stream_5_3_3_Tagger::tokenCount_d_Morpheme(const Lemma &Lemma_) const { + if (Model.second.second.find(Lemma_) == Model.second.second.end()) + return 1; + + long double tokenCount_d_Morpheme_ = 1; + + for (std::map::const_iterator i_ = + Model.second.second.find(Lemma_)->second.begin(); + i_ != Model.second.second.find(Lemma_)->second.end(); ++i_) { + tokenCount_d_Morpheme_ += i_->second; + } + + return tokenCount_d_Morpheme_; +} + +// Returns the number of entries under Model.second.second[Lemma_], plus 1 +// when i_ is not one of them; returns 1 when Lemma_ is absent. +long double Stream_5_3_3_Tagger::typeCount_d_Morpheme(const Lemma &Lemma_, + const i &i_) const { + if (Model.second.second.find(Lemma_) == Model.second.second.end()) + return 1; + + return (Model.second.second.find(Lemma_)->second.find(i_) == + Model.second.second.find(Lemma_)->second.end() + ? 1 + : 0) + + Model.second.second.find(Lemma_)->second.size(); +} + +#if ENABLE_DEBUG + +// Renders the terms of score() as text: the numerator factors in parentheses, +// then "/", then the divisor factors in square brackets. +std::wstring Stream_5_3_3_Tagger::score_DEBUG(const Analysis &Analysis_) const { + std::wstringstream score_DEBUG_; + + score_DEBUG_ << L"(" << tokenCount_r_i(Analysis_) << L" * " + << tokenCount_i(Analysis_); + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + score_DEBUG_ << L" * " << tokenCount_d_i_Morpheme(Lemma(*Morpheme_), + i(*(Morpheme_ - 1))) + << L" * " + << tokenCount_i_d_Morpheme(i(*Morpheme_), Lemma(*Morpheme_)); + } + + score_DEBUG_ << L") /\n [(" << tokenCount_i(Analysis_) << L" + " + << typeCount_i(Analysis_) << L")"; + + for (std::vector::const_iterator Morpheme_ = + Analysis_.TheMorphemes.begin() + 1; + Morpheme_ != Analysis_.TheMorphemes.end(); ++Morpheme_) { + score_DEBUG_ << L" * (" << tokenCount_i_Morpheme(i(*(Morpheme_ - 1))) + << L" + " + << typeCount_i_Morpheme(i(*(Morpheme_ - 1)), Lemma(*Morpheme_)) + << L") * (" << tokenCount_d_Morpheme(Lemma(*Morpheme_)) + << L" + " + << typeCount_d_Morpheme(Lemma(*Morpheme_), 
i(*Morpheme_)) + << L")"; + } + + score_DEBUG_ << L"]"; + return score_DEBUG_.str(); +} + +#endif // ENABLE_DEBUG +} Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger.h (revision 69632) @@ -0,0 +1,62 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef STREAM_5_3_3_TAGGER_H +#define STREAM_5_3_3_TAGGER_H + +#include "apertium_config.h" + +#include "analysis.h" +#include "basic_5_3_3_tagger.h" +#include "basic_stream_tagger.h" +#include "i.h" +#include "lemma.h" + +#include + +#if ENABLE_DEBUG + +#include + +#endif // ENABLE_DEBUG + +namespace Apertium { +// Stream tagger for model "5.3.3". It inherits privately from +// basic_5_3_3_Tagger and publicly from basic_StreamTagger; deserialise() +// loads the model and the private members compute an analysis score from it. +class Stream_5_3_3_Tagger : private basic_5_3_3_Tagger, + public basic_StreamTagger { +public: + Stream_5_3_3_Tagger(const Flags &Flags_); + // Replaces the model with one read from Serialised_basic_Tagger. + void deserialise(std::istream &Serialised_basic_Tagger); + +private: + // Score of a whole analysis. + long double score(const Analysis &Analysis_) const; + // Count terms taken from the analysis as a whole. + long double tokenCount_r_i(const Analysis &Analysis_) const; + long double tokenCount_i(const Analysis &Analysis_) const; + long double typeCount_i(const Analysis &Analysis_) const; + // Count terms taken from an (i, Lemma) pair of individual morphemes. + long double tokenCount_d_i_Morpheme(const Lemma &Lemma_, const i &i_) const; + long double tokenCount_i_d_Morpheme(const i &i_, const Lemma &Lemma_) const; + long double tokenCount_i_Morpheme(const i &i_) const; + long double typeCount_i_Morpheme(const i &i_, const Lemma &Lemma_) const; + long double tokenCount_d_Morpheme(const Lemma &Lemma_) const; + long double typeCount_d_Morpheme(const Lemma &Lemma_, const i &i_) const; + +#if ENABLE_DEBUG + + // Textual rendering of the score expression; only built when ENABLE_DEBUG + // is set. + std::wstring score_DEBUG(const Analysis &Analysis_) const; + +#endif // ENABLE_DEBUG +}; +} + +#endif // STREAM_5_3_3_TAGGER_H Index: branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/stream_5_3_3_tagger_trainer.h (revision 69632) @@ -0,0 +1,39 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at 
your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. + +#ifndef STREAM_5_3_3_TAGGER_TRAINER_H +#define STREAM_5_3_3_TAGGER_TRAINER_H + +#include "analysis.h" +#include "basic_5_3_3_tagger.h" +#include "basic_stream_tagger_trainer.h" + +#include + +namespace Apertium { +// Trainer interface for model "5.3.3". It inherits privately from +// basic_5_3_3_Tagger and publicly from basic_StreamTaggerTrainer. Only +// declarations live here; the member definitions are in a separate source +// file. +class Stream_5_3_3_TaggerTrainer : private basic_5_3_3_Tagger, + public basic_StreamTaggerTrainer { +public: + Stream_5_3_3_TaggerTrainer(const Flags &Flags_); + // Presumably writes the trained model to Serialised_basic_Tagger -- confirm + // against the definition. + void serialise(std::ostream &Serialised_basic_Tagger) const; + +private: + // Private training hooks; NOTE(review): presumably invoked by the + // basic_StreamTaggerTrainer base -- confirm. + void train_Analysis(const Analysis &Analysis_, + const std::size_t &Coefficient_); + void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); +}; +} + +#endif // STREAM_5_3_3_TAGGER_TRAINER_H Index: branches/apertium-tagger/apertium2/apertium/streamed_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/streamed_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/streamed_type.h (revision 69632) @@ -0,0 +1,32 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. + +#ifndef STREAMED_TYPE_H +#define STREAMED_TYPE_H + +#include "lexical_unit.h" +#include "optional.h" + +#include + +namespace Apertium { +// Plain aggregate with two public members: a wide string and an Optional +// lexical-unit value. +// NOTE(review): the template argument of Optional is missing in this copy -- +// restore it from the upstream file. +class StreamedType { +public: + std::wstring TheString; + Optional TheLexicalUnit; +}; +} + +#endif // STREAMED_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/tag.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tag.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tag.cc (revision 69632) @@ -0,0 +1,34 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ +#include "tag.h" + +#include "exception.h" + +#include + +namespace Apertium { +bool operator==(const Tag &a, const Tag &b) { return a.TheTag == b.TheTag; } + +bool operator<(const Tag &a, const Tag &b) { return a.TheTag < b.TheTag; } + +Tag::operator std::wstring() const { + if (TheTag.empty()) + throw Exception::Tag::TheTags_empty("can't convert Tag comprising empty " + "TheTag std::wstring to std::wstring"); + + return L"<" + TheTag + L">"; +} +} Index: branches/apertium-tagger/apertium2/apertium/tag.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tag.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tag.h (revision 69632) @@ -0,0 +1,31 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . 
+ +#ifndef TAG_H +#define TAG_H + +#include + +namespace Apertium { +// A single tag, stored as the wide string TheTag. +class Tag { +public: + // Comparison operators are declared as friends and defined outside the + // class. + friend bool operator==(const Tag &a, const Tag &b); + friend bool operator<(const Tag &a, const Tag &b); + // Implicit conversion to std::wstring (no 'explicit' keyword is used). + operator std::wstring() const; + std::wstring TheTag; +}; +} + +#endif // TAG_H Index: branches/apertium-tagger/apertium2/apertium/wchar_t_exception.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/wchar_t_exception.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wchar_t_exception.h (revision 69632) @@ -0,0 +1,53 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef WCHAR_T_EXCEPTION_H +#define WCHAR_T_EXCEPTION_H + +#include "wchar_t_exception_type.h" + +#include +#include + +namespace Apertium { +namespace wchar_t_Exception { + +// Defines a class named WCHAR_T_EXCEPTION_TYPE that derives publicly from +// ::Apertium::wchar_t_ExceptionType. Its three constructors take a wide C +// string, a std::wstring or a std::wstringstream and forward it to the base. +// The macro is #undef'd at the end of this header. +#define WCHAR_T_EXCEPTION(WCHAR_T_EXCEPTION_TYPE) \ + class WCHAR_T_EXCEPTION_TYPE : public ::Apertium::wchar_t_ExceptionType { \ + public: \ + WCHAR_T_EXCEPTION_TYPE(const wchar_t *wchar_t_what_) \ + : wchar_t_ExceptionType(wchar_t_what_) {} \ + WCHAR_T_EXCEPTION_TYPE(const std::wstring &wchar_t_what_) \ + : wchar_t_ExceptionType(wchar_t_what_) {} \ + WCHAR_T_EXCEPTION_TYPE(const std::wstringstream &wchar_t_what_) \ + : wchar_t_ExceptionType(wchar_t_what_) {} \ + ~WCHAR_T_EXCEPTION_TYPE() throw() {} \ + }; + +// Exception types generated by the macro, grouped in namespace Stream. +namespace Stream { +WCHAR_T_EXCEPTION(TheCharacterStream_not_good) +WCHAR_T_EXCEPTION(UnexpectedAnalysis) +WCHAR_T_EXCEPTION(UnexpectedCase) +WCHAR_T_EXCEPTION(UnexpectedCharacter) +WCHAR_T_EXCEPTION(UnexpectedEndOfFile) +WCHAR_T_EXCEPTION(UnexpectedLemma) +WCHAR_T_EXCEPTION(UnexpectedPreviousCase) +} + +#undef WCHAR_T_EXCEPTION +} +} + +#endif // WCHAR_T_EXCEPTION_H Index: branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.cc (revision 69632) @@ -0,0 +1,90 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. + +#include "wchar_t_exception_type.h" + +#include "exception.h" + +// NOTE(review): the targets of the bare #include lines below are missing in +// this copy of the patch -- restore them from the upstream file. +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +// Exchanges the what_ buffers owned by a and b. +void swap(wchar_t_ExceptionType &a, wchar_t_ExceptionType &b) { + using std::swap; + + swap(a.what_, b.what_); +} + +// Builds the exception from a wide C string: what_ is allocated with the +// byte count reported by size() and then filled by constructor(). +wchar_t_ExceptionType::wchar_t_ExceptionType(const wchar_t *wchar_t_what_) + : what_(new char[size(wchar_t_what_)]) { + constructor(wchar_t_what_); +} + +// Same as above, taking the message from a std::wstring. +wchar_t_ExceptionType::wchar_t_ExceptionType(const std::wstring &wchar_t_what_) + : what_(new char[size(wchar_t_what_.c_str())]) { + constructor(wchar_t_what_.c_str()); +} + +// Same as above, taking the message from the contents of a wstringstream. +wchar_t_ExceptionType::wchar_t_ExceptionType( + const std::wstringstream &wchar_t_what_) + : what_(new char[size(wchar_t_what_.str().c_str())]) { + constructor(wchar_t_what_.str().c_str()); +} + +// Copy constructor: allocates a fresh buffer and copies the other message +// into it, so each object owns its own what_. +wchar_t_ExceptionType::wchar_t_ExceptionType( + const wchar_t_ExceptionType &wchar_t_ExceptionType_) + : what_(new char[std::strlen(wchar_t_ExceptionType_.what_) + 1]) { + std::strcpy(what_, wchar_t_ExceptionType_.what_); +} + +// Assignment takes its argument by value (the copy) and swaps buffers with +// it; the previous buffer is released when the parameter is destroyed. +wchar_t_ExceptionType &wchar_t_ExceptionType:: +operator=(wchar_t_ExceptionType wchar_t_ExceptionType_) { + swap(*this, wchar_t_ExceptionType_); + return *this; +} + +// Releases the owned message buffer. +wchar_t_ExceptionType::~wchar_t_ExceptionType() throw() { delete[] what_; } + +// Returns the stored multibyte message. +const char *wchar_t_ExceptionType::what() const throw() { return what_; } + +// Returns the number of bytes needed to hold wchar_t_what_ as a multibyte +// string: the length reported by std::wcsrtombs with a null destination, plus +// one for the terminator. Throws Exception::wchar_t_ExceptionType::EILSEQ_ +// when errno is EILSEQ after the call. +std::size_t wchar_t_ExceptionType::size(const wchar_t *wchar_t_what_) { + std::mbstate_t ps = {0}; + // Cleared first so that the EILSEQ test below reflects this call only. + errno = 0; + std::size_t size_ = std::wcsrtombs(NULL, &wchar_t_what_, 0, &ps); + + if (errno == EILSEQ) + throw Exception::wchar_t_ExceptionType::EILSEQ_( + "can't convert const wchar_t *wchar_t_what_ to char * : unexpected " + "wide character"); + + return size_ + 1; +} + +// Converts wchar_t_what_ into the already allocated what_ buffer, writing at +// most size(wchar_t_what_) bytes. Throws EILSEQ_ when errno is EILSEQ after +// the conversion. +// NOTE(review): a throw from here would leave what_ (raw new[]) unreleased, +// because the destructor does not run for a partly constructed object. It +// looks unreachable, since size() in the initialiser list converts the same +// input first -- confirm. +void wchar_t_ExceptionType::constructor(const wchar_t *wchar_t_what_) { + std::mbstate_t ps = {0}; + errno = 0; + std::wcsrtombs(what_, &wchar_t_what_, 
size(wchar_t_what_), &ps); + + if (errno == EILSEQ) + throw Exception::wchar_t_ExceptionType::EILSEQ_( + "can't convert const wchar_t *const wchar_t_what_ to char *what_: " + "unexpected wide character"); +} +} Index: branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wchar_t_exception_type.h (revision 69632) @@ -0,0 +1,45 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ +#ifndef WCHAR_T_EXCEPTION_TYPE_H +#define WCHAR_T_EXCEPTION_TYPE_H + +#include "basic_exception_type.h" + +#include +#include +#include + +namespace Apertium { +// Exception base that is constructed from a wide-character message and +// exposes it through what() as a char string held in the owned buffer what_. +class wchar_t_ExceptionType : public basic_ExceptionType { +public: + friend void swap(wchar_t_ExceptionType &a, wchar_t_ExceptionType &b); + wchar_t_ExceptionType(const wchar_t *wchar_t_what_); + wchar_t_ExceptionType(const std::wstring &wchar_t_what_); + wchar_t_ExceptionType(const std::wstringstream &wchar_t_what_); + wchar_t_ExceptionType(const wchar_t_ExceptionType &wchar_t_ExceptionType_); + // Takes its argument by value. + wchar_t_ExceptionType & + operator=(wchar_t_ExceptionType wchar_t_ExceptionType_); + virtual ~wchar_t_ExceptionType() throw(); + const char *what() const throw(); + +private: + // Helpers used by the constructors; their definitions are in the + // corresponding source file. + static std::size_t size(const wchar_t *wchar_t_what_); + void constructor(const wchar_t *wchar_t_what_); + // Message storage owned by this object. + char *what_; +}; +} + +#endif // WCHAR_T_EXCEPTION_TYPE_H Index: branches/apertium-tagger/apertium2/apertium/apertium-desmediawiki.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desmediawiki.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desmediawiki.1 (revision 69632) @@ -0,0 +1,46 @@ +.TH apertium-desmediawiki 1 2009-08-30 "" "" +.SH NAME +apertium-desmediawiki \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-desmediawiki +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desmediawiki +is a processor for mediawiki XML dumps (i.e., those produced using +Special:Export. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of a text file and produces output suitable for +processing with lt-proc. Format information (newlines, tabs, etc.) is enclosed in brackets so that lt-proc treats it as whitespace between words. 
+.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-destxt | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-deshtml\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated links - [[page|alternative text]], [[link]]s, etc. are not +supported. +.PP +The mediawiki parser has special support for mixing literal apostrophes with +apostrophes used as formatting markup. This is not supported either. +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License <http://www.gnu.org/licenses/>. + Index: branches/apertium-tagger/apertium2/apertium/apertium-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-header.sh (revision 69632) @@ -0,0 +1,660 @@ +# -*- sh-basic-offset: 2 -*- + +# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ + +message () +{ + echo "USAGE: $(basename $0) [-d datadir] [-f format] [-u] [in [out]]" + echo " -d datadir directory of linguistic data" + echo " -f format one of: txt (default), html, rtf, odt, docx, wxml, xlsx, pptx," + echo " xpresstag, html-noent, latex, latex-raw" + echo " -a display ambiguity" + echo " -u don't display marks '*' for unknown words" + echo " -n don't insert period before possible sentence-ends" + echo " -m memory.tmx use a translation memory to recycle translations" + echo " -o direction translation direction using the translation memory," + echo " by default 'direction' is used instead" + echo " -l lists the available translation directions and exits" + echo " direction typically, LANG1-LANG2, but see modes.xml in language data" + echo " in input file (stdin by default)" + echo " out output file (stdout by default)" + exit 1 +} + +list_directions () +{ + for mode in "$DATADIR"/modes/*.mode; do + echo " $(basename "${mode%%.mode}")" + done +} + +locale_utf8 () +{ + export LC_CTYPE=$(locale -a|grep -i "utf[.]*8"|head -1); + if [ LC_CTYPE = "" ]; then + echo "Error: Install an UTF-8 locale in your system"; + exit 1; + fi +} + +locale_latin1 () +{ + export LC_CTYPE=$(locale -a|grep -i -e "8859-1" -e "@euro"|head -1); + if [ LC_CTYPE = "" ]; then + echo "Error: Install a Latin-1 locale in your system"; + exit 1; + fi +} + +test_zip () +{ + if [ "$(which zip)" = "" ]; then + echo "Error: Install 'zip' command in your system"; + exit 1; + fi + + if [ "$(which unzip)" = "" ]; then + echo "Error: Install 'unzip' command in your system"; + exit 1; + fi +} + +test_gawk () +{ + GAWK=$(which gawk) + if [ "$GAWK" = "" ]; then + echo "Error: Install 'gawk' in your system" + exit 1 + fi +} + + +translate_latex() +{ + test_gawk + + if [ "$INFILE" = "" -o "$INFILE" = /dev/stdin ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + + if [ "$(file -b --mime-encoding "$INFILE")" == "utf-8" ]; then + locale_latin1 
+ else locale_utf8 + fi + + "$APERTIUM_PATH/apertium-prelatex" "$INFILE" | \ + "$APERTIUM_PATH/apertium-utils-fixlatex" | \ + "$APERTIUM_PATH/apertium-deslatex" ${FORMAT_OPTIONS} | \ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-relatex"| \ + awk '{gsub("", ""); print;}' | \ + if [ "$REDIR" == "" ]; then "$APERTIUM_PATH/apertium-postlatex-raw"; else "$APERTIUM_PATH/apertium-postlatex-raw" > "$SALIDA"; fi + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE" + fi +} + + +translate_latex_raw() +{ + test_gawk + + if [ "$INFILE" = "" -o "$INFILE" = /dev/stdin ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + + if [ "$(file -b --mime-encoding "$INFILE")" = "utf-8" ]; then + locale_latin1 + else locale_utf8 + fi + + "$APERTIUM_PATH/apertium-prelatex" "$INFILE" | \ + "$APERTIUM_PATH/apertium-utils-fixlatex" | \ + "$APERTIUM_PATH/apertium-deslatex" ${FORMAT_OPTIONS} | \ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! 
-x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-relatex"| \ + awk '{gsub("", ""); print;}' | \ + if [ "$REDIR" == "" ]; then "$APERTIUM_PATH/apertium-postlatex-raw"; else "$APERTIUM_PATH/apertium-postlatex-raw" > "$SALIDA"; fi +} + + +translate_odt () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + find "$INPUT_TMPDIR" | grep "content\\.xml\\|styles\\.xml" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-desodt" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-reodt"|\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + rm -Rf ObjectReplacements + zip -q -r - . 
>"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + +translate_docx () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + if [ "$UWORDS" = "no" ]; then + OPCIONU="-u"; + else OPCIONU=""; + fi + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + + for i in $(find "$INPUT_TMPDIR"|grep "xlsx$"); + do LOCALTEMP=$(mktemp "$TMPDIR/apertium.XXXXXXXX"); + "$APERTIUM_PATH/apertium" -f xlsx -d "$DATADIR" "$OPCIONU" "$PAIR" <"$i" >"$LOCALTEMP"; + cp "$LOCALTEMP" "$i"; + rm "$LOCALTEMP"; + done; + + find "$INPUT_TMPDIR" | grep "xml" |\ + grep -v -i \\\(settings\\\|theme\\\|styles\\\|font\\\|rels\\\|docProps\\\) |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-deswxml" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-rewxml"|\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + zip -q -r - . 
>"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + +translate_pptx () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + if [ "$UWORDS" = "no" ]; then + OPCIONU="-u"; + else OPCIONU=""; + fi + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + + for i in $(find "$INPUT_TMPDIR"|grep "xlsx$"); do + LOCALTEMP=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + "$APERTIUM_PATH/apertium" -f xlsx -d "$DATADIR" "$OPCIONU" "$PAIR" <"$i" >"$LOCALTEMP"; + cp "$LOCALTEMP" "$i" + rm "$LOCALTEMP" + done; + + find "$INPUT_TMPDIR" | grep "xml$" |\ + grep "slides\/slide" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-despptx" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-repptx" |\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + zip -q -r - . 
>"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + + +translate_xlsx () +{ + INPUT_TMPDIR=$(mktemp -d "$TMPDIR/apertium.XXXXXXXX") + + locale_utf8 + test_zip + + if [ "$INFILE" = "" ]; then + INFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + cat > "$INFILE" + BORRAFICHERO="true" + fi + OTRASALIDA=$(mktemp "$TMPDIR/apertium.XXXXXXXX") + + unzip -q -o -d "$INPUT_TMPDIR" "$INFILE" + find "$INPUT_TMPDIR" | grep "sharedStrings.xml" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + "$APERTIUM_PATH/apertium-desxlsx" ${FORMAT_OPTIONS} |\ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then cat; + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | \ + if [ ! -x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | \ + "$APERTIUM_PATH/apertium-rexlsx" |\ + awk '{punto = index($0, "/>") + 3; cabeza = substr($0, 1, punto-1); cola = substr($0, punto); n1 = substr(cabeza, index(cabeza, "\"")+1); name = substr(n1, 1, index(n1, "\"")-1); gsub("[?]> ", "?>\n", cola); print cola > name;}' + VUELVE=$(pwd) + cd "$INPUT_TMPDIR" + zip -q -r - . >"$OTRASALIDA" + cd "$VUELVE" + rm -Rf "$INPUT_TMPDIR" + + if [ "$BORRAFICHERO" = "true" ]; then + rm -Rf "$INFILE"; + fi + + if [ "$REDIR" == "" ]; then cat "$OTRASALIDA"; else cat "$OTRASALIDA" > "$SALIDA"; fi + rm -Rf "$OTRASALIDA" + rm -Rf "$TMCOMPFILE" +} + +translate_htmlnoent () +{ + "$APERTIUM_PATH/apertium-deshtml" ${FORMAT_OPTIONS} "$INFILE" | \ + if [ "$TRANSLATION_MEMORY_FILE" = "" ]; then + cat + else "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE"; + fi | if [ ! 
-x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | if [ "$FORMAT" = "none" ]; then + if [ "$REDIR" == "" ]; then cat; else cat > "$SALIDA"; fi + else if [ "$REDIR" == "" ]; then "$APERTIUM_PATH/apertium-rehtml-noent"; else "$APERTIUM_PATH/apertium-rehtml-noent" > "$SALIDA"; fi + fi + + rm -Rf "$TMCOMPFILE" +} + + + + + +########################################################## +# Option and argument parsing, setting globals variables # +########################################################## +PATH="${APERTIUM_PATH}:${PATH}" +[[ -z $TMPDIR ]] && TMPDIR=/tmp +TMCOMPFILE=$(mktemp "$TMPDIR/apertium.XXXXXXXX") +trap 'rm -Rf "$TMCOMPFILE"' EXIT + +# Default values, may be overridden below: +PAIR="" +INFILE="/dev/stdin" +FORMAT="txt" +DATADIR=$DEFAULT_DIRECTORY +TRANSLATION_MEMORY_DIRECTION=$PAIR +LIST_MODES_AND_EXIT=false +FORMAT_OPTIONS="" + +# Skip (but store) non-option arguments that come before options: +declare -a ARGS_PREOPT +declare -i OPTIND=1 +while [[ $OPTIND -le $# ]]; do + arg=${@:$OPTIND:1} + case $arg in + -*) break ;; + *) ARGS_PREOPT+=($arg); (( OPTIND++ )) ;; + esac +done + + +while getopts ":uahlf:d:m:o:n" opt; do + case "$opt" in + f) FORMAT=$OPTARG ;; + d) DATADIR=$OPTARG ;; + m) TRANSLATION_MEMORY_FILE=$OPTARG ;; + o) TRANSLATION_MEMORY_DIRECTION=$OPTARG ;; + u) UWORDS="no" ;; + n) FORMAT_OPTIONS="-n" ;; + a) OPTION_TAGGER="-m" ;; + l) LIST_MODES_AND_EXIT=true ;; + h) message ;; + \?) echo "ERROR: Unknown option $OPTARG"; message ;; + :) echo "ERROR: $OPTARG requires an argument"; message ;; + esac +done +shift $(($OPTIND-1)) + +if $LIST_MODES_AND_EXIT; then list_directions; exit 0; fi + +# Restore non-option arguments that came before options back into arg list: +set -- "${ARGS_PREOPT[@]}" "$@" + +case "$#" in + 3) + SALIDA=$3 + REDIR=">" + INFILE=$2 + PAIR=$1 + if [[ ! 
-e "$INFILE" ]]; then + echo "Error: file '$INFILE' not found." + message + fi + ;; + 2) + INFILE=$2 + PAIR=$1 + if [[ ! -e "$INFILE" ]]; then + echo "Error: file '$INFILE' not found." + message + fi + ;; + 1) + PAIR=$1 + ;; + *) + message + ;; +esac + + +if [[ -n $TRANSLATION_MEMORY_FILE ]]; then + "$APERTIUM_PATH/lt-tmxcomp" "$TRANSLATION_MEMORY_DIRECTION" "$TRANSLATION_MEMORY_FILE" "$TMCOMPFILE" >/dev/null + if [ "$?" != "0" ]; then + echo "Error: Cannot compile TM '$TRANSLATION_MEMORY_FILE'" + echo" hint: use -o parameter" + message + fi +fi + +if [[ ! -d "$DATADIR/modes" ]]; then + echo "Error: Directory '$DATADIR/modes' does not exist." + message +fi + +if [[ ! -e "$DATADIR/modes/$PAIR.mode" ]]; then + echo -n "Error: Mode $PAIR does not exist" + c=$(find "$DATADIR/modes"|wc -l) + if [ "$c" -le 1 ]; then + echo "." + else + echo ". Try one of:" + list_directions + fi + exit 1 +fi + +#Parametro opcional, de no estar, lee de la entrada estandar (stdin) + +case "$FORMAT" in + none) + if [ "$UWORDS" = "no" ]; then + OPTION="-n"; + else OPTION="-g"; + fi + ;; + txt|rtf|html|xpresstag|mediawiki) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + ;; + rtf) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + MILOCALE=$(locale -a|grep -i -v "utf\|^C$\|^POSIX$"|head -1); + if [ "$MILOCALE" = "" ]; then + echo "Error: Install a ISO-8859-1 compatible locale in your system"; + exit 1; + fi + export LC_CTYPE=$MILOCALE + ;; + + odt) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_odt + exit 0 + ;; + latex) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_latex + exit 0 + ;; + latex-raw) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_latex_raw + exit 0 + ;; + + + docx) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_docx + exit 0 + ;; + xlsx) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else 
OPTION="-g"; + fi; + translate_xlsx + exit 0 + ;; + pptx) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_pptx + exit 0 + ;; + html-noent) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + translate_htmlnoent + exit 0 + ;; + + wxml) + if [ "$UWORDS" = "no" ]; then OPTION="-n"; + else OPTION="-g"; + fi; + locale_utf8 + ;; + + txtu) + FORMAT="txt"; + OPTION="-n" + ;; + htmlu) + FORMAT="html"; + OPTION="-n"; + ;; + xpresstagu) + FORMAT="xpresstag"; + OPTION="-n"; + ;; + rtfu) + FORMAT="rtf"; + OPTION="-n"; + MILOCALE=$(locale -a|grep -i -v "utf\|^C$\|^POSIX$"|head -1); + if [ "$MILOCALE" = "" ]; then + echo "Error: Install a ISO-8859-1 compatible locale in your system"; + exit 1; + fi + export LC_CTYPE=$MILOCALE + ;; + + odtu) + OPTION="-n" + translate_odt + exit 0 + ;; + + docxu) + OPTION="-n" + translate_docx + exit 0 + ;; + + xlsxu) + OPTION="-n" + translate_xlsx + exit 0 + ;; + + pptxu) + OPTION="-n" + translate_pptx + exit 0 + ;; + + wxmlu) + OPTION="-n"; + locale_utf8 + ;; + + + + *) # Por defecto asumimos txt + FORMAT="txt" + OPTION="-g" + ;; +esac + +if [ -z "$REF" ] +then + REF=$FORMAT +fi + +set -e -o pipefail + +if [ "$FORMAT" = "none" ]; then + cat "$INFILE" +else + "$APERTIUM_PATH/apertium-des$FORMAT" ${FORMAT_OPTIONS} "$INFILE" +fi | if [ "$TRANSLATION_MEMORY_FILE" = "" ]; + then + cat + else + "$APERTIUM_PATH/lt-tmxproc" "$TMCOMPFILE" + fi | if [ ! 
-x "$DATADIR/modes/$PAIR.mode" ]; then + sh "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + else + "$DATADIR/modes/$PAIR.mode" "$OPTION" "$OPTION_TAGGER" + fi | if [ "$FORMAT" = "none" ]; then + if [ "$REDIR" = "" ]; then + cat + else + cat > "$SALIDA" + fi + else + if [ "$REDIR" = "" ]; then + "$APERTIUM_PATH/apertium-re$FORMAT" + else + "$APERTIUM_PATH/apertium-re$FORMAT" > "$SALIDA" + fi + fi + Index: branches/apertium-tagger/apertium2/apertium/postchunk.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/postchunk.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.cc (revision 69632) @@ -0,0 +1,2074 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; +using namespace std; + +void +Postchunk::destroy() +{ + if(me) + { + delete me; + me = NULL; + } + if(doc) + { + xmlFreeDoc(doc); + doc = NULL; + } +} + +Postchunk::Postchunk() : +word(0), +blank(0), +lword(0), +lblank(0), +output(0), +any_char(0), +any_tag(0), +nwords(0) +{ + me = NULL; + doc = NULL; + root_element = NULL; + lastrule = NULL; + inword = false; + null_flush = false; + internal_null_flush = false; +} + +Postchunk::~Postchunk() +{ + destroy(); +} + +void +Postchunk::readData(FILE *in) +{ + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + attr_items[cad_k].read(in); + wstring fallback = Compression::wstring_read(in); + if(recompile_attrs) { + attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + } + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + macros[cad_k] = Compression::multibyte_read(in); + } + + // lists + 
for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +Postchunk::read(string const &transferfile, string const &datafile) +{ + readPostchunk(transferfile); + + // datafile + FILE *in = fopen(datafile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + +} + +void +Postchunk::readPostchunk(string const &in) +{ + doc = xmlReadFile(in.c_str(), NULL, 0); + + if(doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." << endl; + exit(EXIT_FAILURE); + } + + root_element = xmlDocGetRootElement(doc); + + // search for macros & rules + for(xmlNode *i = root_element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) + { + collectMacros(i); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) + { + collectRules(i); + } + } + } +} + +void +Postchunk::collectRules(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + for(xmlNode *j = i->children; ; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) + { + rule_map.push_back(j); + break; + } + } + } + } +} + +void +Postchunk::collectMacros(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + macro_map.push_back(i); + } + } +} + +bool +Postchunk::checkIndex(xmlNode *element, int index, int limit) +{ + if(index > limit) + { + 
wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) <line << endl; + return false; + } + return true; +} + + +string +Postchunk::evalString(xmlNode *element) +{ + map::iterator it; + it = evalStringCache.find(element); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_clip_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]); + } + break; + + case ti_lu_count: + return StringUtils::itoa_string(tmpword.size()); + + case ti_var: + return variables[ti.getContent()]; + + case ti_lit_tag: + case ti_lit: + return ti.getContent(); + + case ti_b: + if(checkIndex(element, ti.getPos(), lblank)) + { + if(ti.getPos() >= 0) + { + return !blank?"":*(blank[ti.getPos()]); + } + return " "; + } + break; + + case ti_get_case_from: + if(checkIndex(element, ti.getPos(), lword)) + { + return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]), + evalString((xmlNode *) ti.getPointer())); + } + break; + + case ti_case_of_tl: + if(checkIndex(element, ti.getPos(), lword)) + { + return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()])); + } + break; + + default: + return ""; + } + return ""; + } + + if(!xmlStrcmp(element->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *)i->children->content); + } + } + + evalStringCache[element] = TransferInstr(ti_clip_tl, (const char *) part, pos, NULL); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag")) + { + evalStringCache[element] = TransferInstr(ti_lit_tag, + tags((const char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lit")) + { + 
evalStringCache[element] = TransferInstr(ti_lit, string((char *) element->properties->children->content), 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "b")) + { + if(element->properties == NULL) + { + evalStringCache[element] = TransferInstr(ti_b, " ", -1); + } + else + { + int pos = atoi((const char *) element->properties->children->content) - 1; + evalStringCache[element] = TransferInstr(ti_b, "", pos); + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from")) + { + int pos = atoi((const char *) element->properties->children->content); + xmlNode *param = NULL; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + param = i; + break; + } + } + + evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "var")) + { + evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lu-count")) + { + evalStringCache[element] = TransferInstr(ti_lu_count, "", 0); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = element->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content); + } + } + + evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos); + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) + { + string value; + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + value.append(evalString(i)); + } + } + return value; + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *i = element->children; i 
!= NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + myword.append(evalString(i)); + } + } + + if(myword != "") + { + return "^"+myword+"$"; + } + else + { + return ""; + } + } + else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu")) + { + string value; + + bool first_time = true; + + for(xmlNode *i = element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + string myword; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + + if(!first_time) + { + if(myword != "" && myword[0] != '#') //'+#' problem + { + value.append("+"); + } + } + else + { + if(myword != "") + { + first_time = false; + } + } + + value.append(myword); + } + } + + if(value != "") + { + return "^"+value+"$"; + } + else + { + return ""; + } + } + + else + { + cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl; + exit(EXIT_FAILURE); + } + + return evalString(element); +} + +void +Postchunk::processOut(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "lu")) + { + string myword; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + myword.append(evalString(j)); + } + } + if(myword != "") + { + fputwc_unlocked(L'^', output); + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + fputwc_unlocked(L'$', output); + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "mlu")) + { + fputwc_unlocked(L'^', output); + bool first_time = true; + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + string myword; + for(xmlNode *k = j->children; k != NULL; k = k->next) + { + if(k->type == XML_ELEMENT_NODE) + { + myword.append(evalString(k)); + } + } + + if(!first_time) + { + if(myword != "") + { + fputwc_unlocked('+', output); + 
} + } + else + { + if(myword != "") + { + first_time = false; + } + } + fputws_unlocked(UtfConverter::fromUtf8(myword).c_str(), output); + } + } + fputwc_unlocked(L'$', output); + } + else // 'b' + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output); + } + } + } +} + +void +Postchunk::processTags(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (xmlChar const *) "tag")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + fputws_unlocked(UtfConverter::fromUtf8(evalString(j)).c_str(), output); + } + } + } + } + } +} + +void +Postchunk::processInstruction(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose")) + { + processChoose(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let")) + { + processLet(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append")) + { + processAppend(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out")) + { + processOut(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro")) + { + processCallMacro(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case")) + { + processModifyCase(localroot); + } +} + +void +Postchunk::processLet(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + map::iterator it = evalStringCache.find(leftSide); + if(it != evalStringCache.end()) + { + TransferInstr &ti = it->second; + switch(ti.getType()) + { + case ti_var: + variables[ti.getContent()] = evalString(rightSide); + return; + + case ti_clip_tl: + 
word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); + return; + + default: + return; + } + } + if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = evalString(rightSide); + evalStringCache[leftSide] = TransferInstr(ti_var, val, 0); + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + pos = atoi((const char *) i->children->content); + } + } + + + word[pos]->setChunkPart(attr_items[(const char *) part], + evalString(rightSide)); + evalStringCache[leftSide] = TransferInstr(ti_clip_tl, (const char *) part, + pos, NULL); + } +} + +void +Postchunk::processAppend(xmlNode *localroot) +{ + string name; + for(xmlAttr *i = localroot->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "n")) + { + name = (char *) i->children->content; + break; + } + } + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + variables[name].append(evalString(i)); + } + } +} + +void +Postchunk::processModifyCase(xmlNode *localroot) +{ + xmlNode *leftSide = NULL, *rightSide = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(leftSide == NULL) + { + leftSide = i; + } + else + { + rightSide = i; + break; + } + } + } + + if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip")) + { + int pos = 0; + xmlChar *part = NULL; + + for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "part")) + { + part = i->children->content; + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "pos")) + { + 
pos = atoi((const char *) i->children->content); + } + } + + string const result = copycase(evalString(rightSide), + word[pos]->chunkPart(attr_items[(const char *) part])); + word[pos]->setChunkPart(attr_items[(const char *) part], result); + + } + else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var")) + { + string const val = (const char *) leftSide->properties->children->content; + variables[val] = copycase(evalString(rightSide), variables[val]); + } +} + +void +Postchunk::processCallMacro(xmlNode *localroot) +{ + const char *n = (const char *) localroot->properties->children->content; + int npar = 0; + + xmlNode *macro = macro_map[macros[n]]; + + for(xmlAttr *i = macro->properties; i != NULL; i = i->next) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "npar")) + { + npar = atoi((const char *) i->children->content); + break; + } + } + + if (npar <= 0) + { + throw "Postchunk::processCallMacro() assumes npar > 0, but got npar <= 0"; + } + + InterchunkWord **myword = NULL; + if(npar > 0) + { + myword = new InterchunkWord *[npar+1]; + } + string **myblank = NULL; + if(npar > 0) + { + myblank = new string *[npar]; + } + + myword[0] = word[0]; + + int idx = 1; + int lastpos = 0; + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + int pos = atoi((const char *) i->properties->children->content); + if(!checkIndex(localroot, pos, lword)) { + pos=1; // for a rule to match, there has to be at least one word, so should be safe + } + myword[idx] = word[pos]; + if(blank) + { + myblank[idx-1] = blank[lastpos]; + } + + idx++; + lastpos = pos; + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + for(xmlNode *i = macro->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + delete[] myword; + delete[] myblank; +} + +void +Postchunk::processChoose(xmlNode *localroot) +{ 
+ for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "when")) + { + bool picked_option = false; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "test")) + { + if(!processTest(j)) + { + break; + } + else + { + picked_option = true; + } + } + else + { + processInstruction(j); + } + } + } + if(picked_option) + { + return; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + processInstruction(j); + } + } + } + } + } +} + +bool +Postchunk::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +Postchunk::processIn(xmlNode *localroot) +{ + 
xmlNode *value = NULL; + xmlChar *idlist = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(value == NULL) + { + value = i; + } + else + { + idlist = i->properties->children->content; + break; + } + } + } + + string sval = evalString(value); + + if(localroot->properties != NULL) + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + set &myset = listslow[(const char *) idlist]; + if(myset.find(tolower(sval)) != myset.end()) + { + return true; + } + else + { + return false; + } + } + } + + set &myset = lists[(const char *) idlist]; + if(myset.find(sval) != myset.end()) + { + return true; + } + else + { + return false; + } +} + +bool +Postchunk::processTest(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return processLogical(i); + } + } + return false; +} + +bool +Postchunk::processAnd(xmlNode *localroot) +{ + bool val = true; + for(xmlNode *i = localroot->children; val && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val && processLogical(i); + } + } + + return val; +} + +bool +Postchunk::processOr(xmlNode *localroot) +{ + bool val = false; + for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val || processLogical(i); + } + } + + return val; +} + +bool +Postchunk::processNot(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return !processLogical(i); + } + } + return false; +} + +bool +Postchunk::processEqual(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == 
NULL) + { + return evalString(first) == evalString(second); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)) == tolower(evalString(second)); + } + else + { + return evalString(first) == evalString(second); + } + } +} + +bool +Postchunk::beginsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = 0; i != limit; i++) + { + if(s1[i] != s2[i]) + { + return false; + } + } + + return true; +} + +bool +Postchunk::endsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) + { + if(s1[j] != s2[i]) + { + return false; + } + } + + return true; +} + + +bool +Postchunk::processBeginsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return beginsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return beginsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return beginsWith(evalString(first), evalString(second)); + } + } +} + +bool +Postchunk::processEndsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return endsWith(evalString(first), evalString(second)); + } + else + { + 
if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return endsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return endsWith(evalString(first), evalString(second)); + } + } +} + +bool +Postchunk::processBeginsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(beginsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Postchunk::processEndsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(endsWith(needle, *it)) + { + return 
true; + } + } + return false; +} + + +bool +Postchunk::processContainsSubstring(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first).find(evalString(second)) != string::npos; + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + } + else + { + return evalString(first).find(evalString(second)) != string::npos; + } + } +} + +string +Postchunk::copycase(string const &source_word, string const &target_word) +{ + wstring result; + wstring const s_word = UtfConverter::fromUtf8(source_word); + wstring const t_word = UtfConverter::fromUtf8(target_word); + + bool firstupper = iswupper(s_word[0]); + bool uppercase = firstupper && iswupper(s_word[s_word.size()-1]); + bool sizeone = s_word.size() == 1; + + if(!uppercase || (sizeone && uppercase)) + { + result = StringUtils::tolower(t_word); + } + else + { + result = StringUtils::toupper(t_word); + } + + if(firstupper) + { + result[0] = towupper(result[0]); + } + + return UtfConverter::toUtf8(result); +} + +string +Postchunk::caseOf(string const &str) +{ + wstring const s = UtfConverter::fromUtf8(str); + + if(s.size() > 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else if(!iswupper(s[s.size()-1])) + { + return "Aa"; + } + else + { + return "AA"; + } + } + else if(s.size() == 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else + { + return "Aa"; + } + } + else + { + return "aa"; + } +} + +wstring +Postchunk::caseOf(wstring const &str) +{ + if(str.size() > 1) + { + if(!iswupper(str[0])) + { + return L"aa"; + } + else if(!iswupper(str[str.size()-1])) + { + return L"Aa"; + } + else + { + return L"AA"; + } + } 
+ else if(str.size() == 1) + { + if(!iswupper(str[0])) + { + return L"aa"; + } + else + { + return L"Aa"; + } + } + else + { + return L"aa"; + } +} + +string +Postchunk::tolower(string const &str) const +{ + return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); +} + +string +Postchunk::tags(string const &str) const +{ + string result = "<"; + + for(unsigned int i = 0, limit = str.size(); i != limit; i++) + { + if(str[i] == '.') + { + result.append("><"); + } + else + { + result += str[i]; + } + } + + result += '>'; + + return result; +} + +void +Postchunk::processRule(xmlNode *localroot) +{ + // localroot is suposed to be an 'action' tag + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } +} + +TransferToken & +Postchunk::readToken(FILE *in) +{ + if(!input_buffer.isEmpty()) + { + return input_buffer.next(); + } + + wstring content; + while(true) + { + int val = fgetwc_unlocked(in); + if(feof(in) || (internal_null_flush && val == 0)) + { + return input_buffer.add(TransferToken(content, tt_eof)); + } + if(val == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L'[') + { + content += L'['; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } + } + } + else if(inword && val == L'{') + { + content += L'{'; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L'}') + { + int val3 = wchar_t(fgetwc_unlocked(in)); + ungetwc(val3, in); + + content += L'}'; + if(val3 == L'$') + { + break; + } + } + else + { + content += wchar_t(val2); + } + } + } + else if(inword && val == L'$') + { + inword = false; + return 
input_buffer.add(TransferToken(content, tt_word)); + } + else if(val == L'^') + { + inword = true; + return input_buffer.add(TransferToken(content, tt_blank)); + } + else + { + content += wchar_t(val); + } + } +} + +bool +Postchunk::getNullFlush(void) +{ + return null_flush; +} + +void +Postchunk::setNullFlush(bool null_flush) +{ + this->null_flush = null_flush; +} + +void +Postchunk::postchunk_wrapper_null_flush(FILE *in, FILE *out) +{ + null_flush = false; + internal_null_flush = true; + + while(!feof(in)) + { + postchunk(in, out); + fputwc_unlocked(L'\0', out); + int code = fflush(out); + if(code != 0) + { + wcerr << L"Could not flush output " << errno << endl; + } + } + + internal_null_flush = false; + null_flush = true; +} + +void +Postchunk::postchunk(FILE *in, FILE *out) +{ + if(getNullFlush()) + { + postchunk_wrapper_null_flush(in, out); + } + + int last = 0; + + output = out; + ms.init(me->getInitial()); + + while(true) + { + if(ms.size() == 0) + { + if(lastrule != NULL) + { + applyRule(); + input_buffer.setPos(last); + } + else + { + if(tmpword.size() != 0) + { + unchunk(*tmpword[0], output); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) + { + fputws_unlocked(tmpblank[0]->c_str(), output); + tmpblank.clear(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + } + } + int val = ms.classifyFinals(me->getFinals()); + if(val != -1) + { + lastrule = rule_map[val-1]; + last = input_buffer.getPos(); + } + + TransferToken ¤t = readToken(in); + + switch(current.getType()) + { + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(¤t.getContent()); + break; + + case tt_blank: + ms.step(L' '); + tmpblank.push_back(¤t.getContent()); + break; + + case tt_eof: + if(tmpword.size() != 0) + { + tmpblank.push_back(¤t.getContent()); + ms.clear(); + } + else + { + fputws_unlocked(current.getContent().c_str(), output); + 
return; + } + break; + + default: + cerr << "Error: Unknown input token." << endl; + return; + } + } +} + +void +Postchunk::applyRule() +{ + wstring const chunk = *tmpword[0]; + tmpword.clear(); + splitWordsAndBlanks(chunk, tmpword, tmpblank); + + word = new InterchunkWord *[tmpword.size()+1]; + lword = tmpword.size(); + word[0] = new InterchunkWord(UtfConverter::toUtf8(wordzero(chunk))); + + for(unsigned int i = 1, limit = tmpword.size()+1; i != limit; i++) + { + if(i == 1) + { + if(limit != 2) + { + blank = new string *[limit - 2]; + lblank = limit - 3; + } + else + { + blank = NULL; + lblank = 0; + } + } + else + { + blank[i-2] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + } + + word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i-1])); + } + + processRule(lastrule); + lastrule = NULL; + + if(word) + { + for(unsigned int i = 0, limit = tmpword.size() + 1; i != limit; i++) + { + delete word[i]; + } + delete[] word; + } + if(blank) + { + for(unsigned int i = 0, limit = tmpword.size() - 1; i != limit; i++) + { + delete blank[i]; + } + delete[] blank; + } + word = NULL; + blank = NULL; + + for(unsigned int i = 0, limit = tmpword.size(); i != limit; i++) + { + if(i != 0) + { + delete tmpblank[i]; + } + delete tmpword[i]; + } + tmpword.clear(); + tmpblank.clear(); + ms.init(me->getInitial()); +} + +void +Postchunk::applyWord(wstring const &word_str) +{ + ms.step(L'^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) + { + switch(word_str[i]) + { + case L'\\': + i++; + ms.step(towlower(word_str[i]), any_char); + break; + + case L'<': +/* for(unsigned int j = i+1; j != limit; j++) + { + if(word_str[j] == '>') + { + int symbol = alphabet(word_str.substr(i, j-i+1)); + if(symbol) + { + ms.step(symbol, any_tag); + } + else + { + ms.step(any_tag); + } + i = j; + break; + } + } + break;*/ + + case L'{': // ignore the unmodifiable part of the chunk + ms.step(L'$'); + return; + + default: + ms.step(towlower(word_str[i]), any_char); + break; + 
} + } + ms.step(L'$'); +} + +vector +Postchunk::getVecTags(wstring const &chunk) +{ + vector vectags; + + for(int i = 0, limit = chunk.size(); i != limit; i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'<') + { + wstring mytag; + do + { + mytag += chunk[i++]; + } + while(chunk[i] != L'>'); + vectags.push_back(mytag + L'>'); + } + else if(chunk[i] == L'{') + { + break; + } + } + return vectags; +} + +int +Postchunk::beginChunk(wstring const &chunk) +{ + for(int i = 0, limit = chunk.size(); i != limit; i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'{') + { + return i + 1; + } + } + return chunk.size(); +} + +int +Postchunk::endChunk(wstring const &chunk) +{ + return chunk.size()-2; +} + +wstring +Postchunk::wordzero(wstring const &chunk) +{ + for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'{') + { + return chunk.substr(0, i); + } + } + + return L""; +} + +wstring +Postchunk::pseudolemma(wstring const &chunk) +{ + for(unsigned int i = 0, limit = chunk.size(); i != limit ;i++) + { + if(chunk[i] == L'\\') + { + i++; + } + else if(chunk[i] == L'<' || chunk[i] == L'{') + { + return chunk.substr(0, i); + } + } + + return L""; +} + +void +Postchunk::unchunk(wstring const &chunk, FILE *output) +{ + vector vectags = getVecTags(chunk); + wstring case_info = caseOf(pseudolemma(chunk)); + bool uppercase_all = false; + bool uppercase_first = false; + + if(case_info == L"AA") + { + uppercase_all = true; + } + else if(case_info == L"Aa") + { + uppercase_first = true; + } + + for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++) + { + if(chunk[i] == L'\\') + { + fputwc_unlocked(L'\\', output); + fputwc_unlocked(chunk[++i], output); + } + else if(chunk[i] == L'^') + { + fputwc_unlocked(L'^', output); + while(chunk[++i] != L'$') + { + if(chunk[i] == L'\\') + { + fputwc_unlocked(L'\\', output); + fputwc_unlocked(chunk[++i], output); + } + 
else if(chunk[i] == L'<') + { + if(iswdigit(chunk[i+1])) + { + // replace tag + unsigned long value = wcstoul(chunk.c_str()+i+1, + NULL, 0) - 1; + //atoi(chunk.c_str()+i+1)-1; + if(vectags.size() > value) + { + fputws_unlocked(vectags[value].c_str(), output); + } + while(chunk[++i] != L'>'); + } + else + { + fputwc_unlocked(L'<', output); + while(chunk[++i] != L'>') fputwc_unlocked(chunk[i], output); + fputwc_unlocked(L'>', output); + } + } + else + { + if(uppercase_all) + { + fputwc_unlocked(towupper(chunk[i]), output); + } + else if(uppercase_first) + { + if(iswalnum(chunk[i])) + { + fputwc_unlocked(towupper(chunk[i]), output); + uppercase_first = false; + } + else + { + fputwc_unlocked(chunk[i], output); + } + } + else + { + fputwc_unlocked(chunk[i], output); + } + } + } + fputwc_unlocked(L'$', output); + } + else if(chunk[i] == L'[') + { + fputwc_unlocked(L'[', output); + while(chunk[++i] != L']') + { + if(chunk[i] == L'\\') + { + fputwc_unlocked(L'\\', output); + fputwc_unlocked(chunk[++i], output); + } + else + { + fputwc_unlocked(chunk[i], output); + } + } + fputwc_unlocked(L']', output); + } + else + { + fputwc_unlocked(chunk[i], output); + } + } +} + + +void +Postchunk::splitWordsAndBlanks(wstring const &chunk, vector &words, + vector &blanks) +{ + vector vectags = getVecTags(chunk); + wstring case_info = caseOf(pseudolemma(chunk)); + bool uppercase_all = false; + bool uppercase_first = false; + bool lastblank = true; + + if(case_info == L"AA") + { + uppercase_all = true; + } + else if(case_info == L"Aa") + { + uppercase_first = true; + } + + for(int i = beginChunk(chunk), limit = endChunk(chunk); i < limit; i++) + { + if(chunk[i] == L'^') + { + if(!lastblank) + { + blanks.push_back(new wstring(L"")); + } + lastblank = false; + wstring *myword = new wstring(); + wstring &ref = *myword; + + while(chunk[++i] != L'$') + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else if(chunk[i] == L'<') + { + if(iswdigit(chunk[i+1])) + { + // 
replace tag + unsigned long value = wcstoul(chunk.c_str()+i+1, + NULL, 0) - 1; + if(vectags.size() > value) + { + ref.append(vectags[value]); + } + while(chunk[++i] != L'>'); + } + else + { + ref += L'<'; + while(chunk[++i] != L'>') ref += chunk[i]; + ref += L'>'; + } + } + else + { + if(uppercase_all) + { + ref += towupper(chunk[i]); + } + else if(uppercase_first) + { + if(iswalnum(chunk[i])) + { + ref += towupper(chunk[i]); + uppercase_first = false; + } + else + { + ref += chunk[i]; + } + } + else + { + ref += chunk[i]; + } + } + } + + words.push_back(myword); + } + else if(chunk[i] == L'[') + { + if (!(lastblank && blanks.back())) + { + blanks.push_back(new wstring()); + } + wstring &ref = *(blanks.back()); + ref += L'['; + while(chunk[++i] != L']') + { + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else + { + ref += chunk[i]; + } + } + ref += chunk[i]; + + lastblank = true; + } + else + { + if (!lastblank) + { + wstring *myblank = new wstring(L""); + blanks.push_back(myblank); + } + wstring &ref = *(blanks.back()); + if(chunk[i] == L'\\') + { + ref += L'\\'; + ref += chunk[++i]; + } + else + { + ref += chunk[i]; + } + lastblank = true; + } + } +} + Index: branches/apertium-tagger/apertium2/apertium/unlocked_cstdio.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/unlocked_cstdio.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/unlocked_cstdio.h (revision 69632) @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _APERTIUM_UNLOCKED_CSTDIO_ +#define _APERTIUM_UNLOCKED_CSTDIO_ + +#include + +#if !HAVE_DECL_FPUTS_UNLOCKED +#define fputs_unlocked fputs +#endif + +#if !HAVE_DECL_FGETC_UNLOCKED +#define fgetc_unlocked fgetc +#endif + +#if !HAVE_DECL_FPUTC_UNLOCKED +#define fputc_unlocked fputc +#endif + +#if !HAVE_DECL_FWRITE_UNLOCKED +#define fwrite_unlocked fwrite +#endif + +#if !HAVE_DECL_FREAD_UNLOCKED +#define fread_unlocked fread +#endif + +#if !HAVE_DECL_FGETWC_UNLOCKED +#define fgetwc_unlocked fgetwc +#endif + +#if !HAVE_DECL_FPUTWC_UNLOCKED +#define fputwc_unlocked fputwc +#endif + +#if !HAVE_DECL_FPUTWS_UNLOCKED +#define fputws_unlocked fputws +#endif + +#if !HAVE_MBTOWC +#include +inline int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); } +inline int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); } +#endif + +#endif Index: branches/apertium-tagger/apertium2/apertium/lextor.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lextor.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lextor.cc (revision 69632) @@ -0,0 +1,1045 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include + +#include +#include +#include +#include + +using namespace Apertium; + + +#define PI 3.14159265358979323846264338327950288 + +bool LexTor::debug; +double LexTor::angleth; + +LexTor::LexTor() : +fstpbil(0) +{ + lextor_data=NULL; + tlmodel=NULL; +} + +LexTor::LexTor(const LexTor& lt) : +fstpbil(0) +{ + lextor_data=lt.lextor_data; + tlmodel=lt.tlmodel; +} + +LexTor::~LexTor() { +} + +void +LexTor::set_lextor_data(LexTorData* ltd) { + lextor_data=ltd; +} + +void +LexTor::set_tlmodel(LexTorData* tlm) { + tlmodel=tlm; +} + +void +LexTor::set_bildic(FSTProcessor *fstp) { + fstpbil=fstp; +} + +void +LexTor::trainwrd(wistream& is, int left, int right, double weigth_exponent) { + if (lextor_data==NULL) { + wcerr<ensure_stopwords_ok(); + + wcerr< words2workwith=lextor_data->get_words(); + set::iterator itword; + + map wordsum; + + wcerr< > context; + deque buffer; + unsigned word_index=(unsigned)left; + + unsigned buffer_max_size=(unsigned)(left+1+right); + + LexTorWord *ltword; + ltword=LexTorWord::next_word(is); + while(ltword!=NULL) { + if ((++nw%250000)==0) + wcerr<get_word_string() + <reduce(ltword->get_word_string())<reduce(ltword->get_word_string()); + + if (!lextor_data->is_stopword(reduced_word)) { + if (buffer.size()>=buffer_max_size) { + buffer.pop_front(); + } + buffer.push_back(reduced_word); + + wordsum[reduced_word]+=1.0; + + //The buffer is already full + if (buffer.size()==buffer_max_size) { + for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) { + if (buffer[word_index]==(*itword)) { + if(debug) { + wcerr<>>>"<::iterator itws; + 
for(itws=wordsum.begin(); itws!=wordsum.end(); itws++) { + lextor_data->set_wordcount(itws->first,itws->second); + //if(debug) { + wcerr<first<second< > context_v; + map::iterator itm; + + while(context[*itword].size()>0) { + itm=context[*itword].begin(); + context_v.push_back(*itm); + context[*itword].erase(itm); + } + + sort(context_v.begin(), context_v.end(), comparer); + wstring w=*itword; + lextor_data->set_cooccurrence_context(w, context_v); + lextor_data->set_lexchoice_sum(w, wordsum[w]); + + //if (debug) { + wcerr<ensure_stopwords_ok(); + + wcerr< words2workwith=lextor_data->get_words(); + set::iterator itword; + + map wordsum; + map lechsum; + + wcerr< lexchoice_translation; + map > lexical_choices_of_word; + + wcerr< lexical_choices=lextor_data->get_lexical_choices(*itword); + lexical_choices_of_word[*itword]=lexical_choices; + set::iterator itlch; + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + lexchoice_translation[*itlch]=tlwordmodel.reduce(bildic.biltrans(*itlch,false)); + wcerr<<*itlch< > context; + deque buffer; + + int word_index=left; + unsigned buffer_max_size=left+right+1; + + LexTorWord *ltword; + ltword=LexTorWord::next_word(is,&dic); + while(ltword!=NULL) { + if (debug) { + wcerr<get_word_string()<reduce(ltword->get_word_string()); + getchar(); + } + if ((++nw%250000)==0) + wcerr<reduce(ltword->get_word_string()); + + if (!lextor_data->is_stopword(reduced_word)) { + if (buffer.size()>=buffer_max_size) { + buffer.pop_front(); + } + buffer.push_back(*ltword); + + wordsum[reduced_word]+=1.0; + + //The buffer is already full + if (buffer.size()==buffer_max_size) { + + wstring reduced_buffer_word=lextor_data->reduce(buffer[word_index].get_word_string()); + + for(itword=words2workwith.begin(); itword!=words2workwith.end(); itword++) { + if (reduced_buffer_word==(*itword)) { + //We translate each word in the context + //Note: Words in the context can also be ambiguous (with more than one lexical choice) + //In that 
case the count will come from all the possible + //translations + vector > translation_buffer(buffer_max_size); + vector reduced_buffer(buffer_max_size); + + for (int i=0; i<(int)buffer_max_size; i++) { + reduced_buffer[i]=lextor_data->reduce(buffer[i].get_word_string()); + } + + if(debug) { + wcerr<>>>"<0) { + wstring tr=tlwordmodel.reduce(aux_tr); + translation_buffer[i].push_back(tr); + str_translations+=tr+L"/"; + } else { + wcerr<>>>"< lexical_choices=lexical_choices_of_word[*itword]; + set::iterator itlch; + + map > local_context; + map sumvotes_context; + + //For each lexical choice the counts from the TL are collected + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + for (int i=0; i<(int)buffer_max_size; i++) { + if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) { + COUNT_DATA_TYPE target_vote=0; + + //The counts of the TL co-occurrence model are transferred to the SL. If the SL word is ambiguous + //it will have more than one translation into TL, so we need to normalize using the frequency of words + //in the TL + vector translation_weighs(translation_buffer[i].size()); + double sum=0.0; + if (translation_buffer[i].size()>1) { + for(int j=0; j<(int)translation_buffer[i].size(); j++) { + translation_weighs[j]=tlwordmodel.get_lexchoice_sum(translation_buffer[i][j]); + sum+=translation_weighs[j]; + + //!!!!! Para hacer que no tenga en cuenta las polisemicas del contexto + ///////translation_weighs[j]=0; + //!!!!! 
+ + if (debug) { + wcerr<0) { + aux_vote=(tlwordmodel.vote_from_word(lexchoice_translation[*itlch],translation_buffer[i][j])/ + tlwordmodel.get_wordcount(lexchoice_translation[*itlch]))*translation_weighs[j]; + if (debug) { + wcerr<0) { + wcerr<0) { + local_context[*itlch][reduced_buffer[i]]+=target_vote; + sumvotes_context[reduced_buffer[i]]+=target_vote; + } + } + } + } + + if (debug) { + wcerr< local_lexsum; + double local_lexsumsum=0.0; + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + int distance=(-1)*left; + for (int i=0; i<(int)buffer_max_size; i++) { + if ((i!=word_index)&&(reduced_buffer[i]!=(*itword))) { + if (local_context[*itlch][reduced_buffer[i]]>0) { + double cc=local_context[*itlch][reduced_buffer[i]]/sumvotes_context[reduced_buffer[i]]; + double count_to_apply=cc/pow(fabs((double)distance),weigth_exponent); + context[*itlch][reduced_buffer[i]]+=count_to_apply; + if (debug) { + wcerr<0) && (local_lexsumsum>0)) + lechsum[*itlch]+=local_lexsum[*itlch]/local_lexsumsum; + if (debug) { + wcerr<::iterator itws; + for(itws=wordsum.begin(); itws!=wordsum.end(); itws++) { + lextor_data->set_wordcount(itws->first,itws->second); + //if(debug) { + wcerr<first<second< lexical_choices=lexical_choices_of_word[*itword]; + set::iterator itlch; + for(itlch=lexical_choices.begin(); itlch!=lexical_choices.end(); itlch++) { + PairStringCountComparer comparer; + vector > context_v; + map::iterator itm; + + while(context[*itlch].size()>0) { + itm=context[*itlch].begin(); + //wcerr<first<second<set_cooccurrence_context(lch, context_v); + //lextor_data->set_lexchoice_sum(lch, tlwordmodel.get_lexchoice_sum(lexchoice_translation[lch])); + + //wcerr<::iterator itlcs; + for(itlcs=lechsum.begin(); itlcs!=lechsum.end(); itlcs++) { + lextor_data->set_lexchoice_sum(itlcs->first,itlcs->second); + //if(debug) { + wcerr<first<second< buffer; + deque window; + + LexTorWord nullword(L"NULLWORD", &fstp); + + for(int i=0; i<(left+right+1); i++) + 
window.push_back(nullword); + + int retain=0; + + LexTorWord* ltword=NULL; + ltword=LexTorWord::next_word(is, &fstp); + + while(ltword) { + //wcerr<get_word_string() + //<reduce(ltword->get_word_string())<n_lexical_choices()<is_stopword(lextor_data->reduce(ltword->get_word_string()))) { + if (window.size()>=(unsigned)(left+1+right)) + window.pop_front(); + + window.push_back(*ltword); + + if (ltword->n_lexical_choices()>1) { + retain++; + if (retain>1) + buffer.push_back(*ltword); + } else { + if (retain>0) + buffer.push_back(*ltword); + else { + wcout<get_lexical_choice(-1,true); + if (lteval) + lteval->evalword(*ltword, -1, lextor_data); + } + } + + if (window[left].n_lexical_choices()>1) { + + if (debug) { + wcerr<>>>"<evalword(window[left], winner, lextor_data); + + //For debug + /* + cout<0) + cout<0) { + while ((buffer.size()>0)&&(buffer[0].n_lexical_choices()==1)) { + wcout<evalword(buffer[0], -1, lextor_data); + buffer.pop_front(); + } + if ((buffer.size()>0)&&(buffer[0].n_lexical_choices()>1)) + buffer.pop_front(); + + retain--; + } + } + } else { //It's a stopword + if (retain>0) + buffer.push_back(*ltword); + else { + wcout<get_lexical_choice(-1,true); + if (lteval) + lteval->evalword(*ltword, -1, lextor_data); + } + } + + delete ltword; + ltword=LexTorWord::next_word(is, &fstp); + } + + if (retain>0) { + for(unsigned i=left+1; i1) { + int winner=estimate_winner_lch(window, i, weigth_exponent); + + wcout<evalword(window[i], winner, lextor_data); + + //For debug + /* + cout<0) + cout<0) { + while ((buffer.size()>0)&&(buffer[0].n_lexical_choices()==1)) { + wcout<evalword(buffer[0], -1, lextor_data); + buffer.pop_front(); + } + if ((buffer.size()>0)&&(buffer[0].n_lexical_choices()>1)) + buffer.pop_front(); + + retain--; + } + + } + } + } + + //wcerr<& window, int word_index, double weigth_exponent) { + //return estimate_winner_lch_cosine(window, word_index, weigth_exponent); + return estimate_winner_lch_voting(window, word_index, weigth_exponent); + 
//return estimate_winner_lch_mostprob(window, word_index, weigth_exponent); + //return estimate_winner_lch_votingtl(window, word_index, weigth_exponent); + //return -1; +} + +int +LexTor::estimate_winner_lch_voting(deque& window, int word_index, double weigth_exponent) { + vector lexchoices_count(window[word_index].n_lexical_choices()); + + if (debug) { + wcerr<>>>"<reduce(window[i].get_word_string())<reduce(window[i].get_word_string())<get_lexchoice_sum(lextor_data->reduce_lexical_choice(window[word_index].get_lexical_choice(i,false))); + sum_lexchoices+=aux_lexchoice_sum; + if (debug) { + wcerr<reduce_lexical_choice(window[word_index].get_lexical_choice(i,false))<get_wordcount(lextor_data->reduce(window[word_index].get_word_string())); + if (debug) { + wcerr<reduce(window[word_index].get_word_string())<reduce_lexical_choice(window[word_index].get_lexical_choice(i,false)); + if (debug) { + wcerr<reduce(window[j].get_word_string()); + + if (lextor_data->get_wordcount(reduced_word)>0) { + vote=lextor_data->vote_from_word(reduced_lexchoice, reduced_word)/ + (((lextor_data->get_lexchoice_sum(reduced_lexchoice))/sum_lexchoices)*wordcount); + + lexchoices_count[i]+=vote/pow(fabs((double)distance),weigth_exponent); + } + + if (debug) { + wcerr<vote_from_word(reduced_lexchoice, reduced_word)<get_wordcount(reduced_word)<0) && (lexchoices_count[i]>winner_vote)) { + winner_vote=lexchoices_count[i]; + winner=i; + } + /* + else if ((lexchoices_count[i]>0) && (lexchoices_count[i]==winner_vote)) { + //Take the most probable one, the one with the highest sum + COUNT_DATA_TYPE sum_i=lextor_data->get_lexchoice_sum(lextor_data->reduce(window[word_index].get_lexical_choice(i))); + COUNT_DATA_TYPE sum_win=lextor_data->get_lexchoice_sum(lextor_data->reduce(window[word_index].get_lexical_choice(winner))); + if (sum_i>sum_win) + winner=i; + } + */ + } + + if (debug) { + wcerr<& window, int word_index, double weigth_exponent) { + int winner=-1; + double greatest_sum=-1; + for(int i=0; 
ireduce_lexical_choice(window[word_index].get_lexical_choice(i,false)); + double sumlch=lextor_data->get_lexchoice_sum(reduced_lexchoice); + + + if (debug) { + wcerr<greatest_sum) { + greatest_sum=sumlch; + winner=i; + } + } + + if (greatest_sum==0) + winner=-1; + + if (debug) + wcerr<& window, int word_index, double weigth_exponent) { + map vcontext; + + int distance=(-1)*(word_index); + for(int i=0; i<(int)window.size(); i++) { + if (i!=word_index) { + wstring reduced_word=lextor_data->reduce(window[i].get_word_string()); + vcontext[reduced_word]+=1.0/pow(fabs((double)distance),weigth_exponent); + } + distance++; + } + + if (debug) { + wcerr<::iterator it; + for(it=vcontext.begin(); it!=vcontext.end(); it++) + wcerr<first<second<reduce_lexical_choice(window[word_index].get_lexical_choice(i,false)); + + double aux_cosine=cosine(vcontext, reduced_lexchoice); + double aux_angle=(acos(aux_cosine)*180)/PI; + if (debug) { + wcerr<reduce(window[word_index].get_word_string())<max_cosine) { + diff_angle=abs(min_angle-aux_angle); + winner=i; + max_cosine=aux_cosine; + min_angle=aux_angle; + } + */ + } + + if (debug) { + wcerr<& window, int word_index, double weigth_exponent) { + if (tlmodel==NULL) { + wcerr< lexchoices_count(window[word_index].n_lexical_choices()); + vector > translation_window (window.size()); + vector reduced_window(window.size()); + + for (unsigned i=0; ireduce(window[i].get_word_string()); + + if(debug) { + wcerr<>>>"<reduce(window[i].translate(*fstpbil,j)); + translation_window[i].push_back(tr); + str_translations+=tr+L"/"; + } + if (debug) { + if (i==(unsigned)word_index) + wcerr<>>>"< translation_weighs(translation_window[k].size()); + double sum=0.0; + if (translation_window[k].size()>1) { + for(unsigned j=0; jget_lexchoice_sum(translation_window[k][j]); + sum+=translation_weighs[j]; + + //!!!!! Para hacer que no tenga en cuenta las + //!!!!! polisemicas del contexto + ///////translation_weighs[j]=0; + //!!!!! + //!!!!! 
+ + if (debug) { + wcerr<vote_from_word(translation_window[word_index][i],translation_window[k][j])<get_wordcount(translation_window[k][j])<get_wordcount(translation_window[k][j])>0) { + aux_vote=(tlmodel->vote_from_word(translation_window[word_index][i],translation_window[k][j])/ + tlmodel->get_wordcount(translation_window[k][j]))*translation_weighs[j]; + } + target_vote+=aux_vote; + + if(debug) { + wcerr<0) && (lexchoices_count[i]>winner_vote)) { + winner_vote=lexchoices_count[i]; + winner=i; + } + } + + if (debug) + wcerr<& vcontext, const wstring& reduced_lexchoice) { + map::iterator itc; + + //We calculate the scalar product between vcontext and the lexchoice vector + double scalar_product=0; + for(itc=vcontext.begin(); itc!=vcontext.end(); itc++) { + scalar_product+=(itc->second)*(lextor_data->vote_from_word(reduced_lexchoice, itc->first)); + } + + //We calculate the module of vcontext, ||vcontext|| + double module_vcontext=0; + for(itc=vcontext.begin(); itc!=vcontext.end(); itc++) { + module_vcontext+=(itc->second)*(itc->second); + } + module_vcontext=sqrt(module_vcontext); + + //We get the module of the lexchoice vector, ||lexchoice vector|| + double module_lexchoice_vector=lextor_data->get_module_lexchoice_vector(reduced_lexchoice); + + if (module_vcontext==0) { + wcerr<::iterator it; + for(it=vcontext.begin(); it!=vcontext.end(); it++) + wcerr<first<second< +#include +#include +#include +#include +#include + +extern "C" { +#if !defined(__STDC__) +# define __STDC__ 1 +#endif +#include +} + +#include +#include +#include +#ifndef GENFORMAT +#include "apertium_config.h" +#endif +#include +#ifdef _WIN32 +#include +#include +#endif + +using namespace std; + +AccentsMap accentsMap(false); +wstring closesym = L""; +string memconv = ""; +//For german babel detection +bool ngermanbabel = false; + +wstring convertir(string const &multibyte, int const length) +{ + memconv.append(multibyte.c_str(), length); + int tam = memconv.size(); + wchar_t *retval = new 
wchar_t[tam+1]; + size_t l = mbstowcs(retval, memconv.c_str(), tam); + + if(l == ((size_t) -1)) + { + delete[] retval; + if(memconv.size() >= 4) + { + wcerr << L"Warning: wrong encoding" << endl; + } + return L""; + } + else + { + memconv = ""; + retval[l] = 0; + wstring ret = retval; + delete[] retval; + return ret; + } +} + + + + +%} + + +%option nounput +%option noyywrap +%option stack + +%x mathenv +%x readbrackets + +%% + + + + + +\\t\{..\} { //This information is lost + fputws(convertir(yytext+3,yyleng-4).c_str(),yyout); +} +\\l { + fputws(L"Ƃ", yyout); +} + +\"[oOaAuUsS] { //When usepackage[ngerman]{babel} is present (not checked). + if(!ngermanbabel) + fputws(convertir(yytext,yyleng).c_str(),yyout); + else { + switch(yytext[1]){ + case 'o': fputws(L"ö", yyout); break; + case 'O': fputws(L"Ö", yyout); break; + case 'a': fputws(L"Ă€", yyout); break; + case 'A': fputws(L"Ä", yyout); break; + case 'u': fputws(L"ĂŒ", yyout); break; + case 'U': fputws(L"Ü", yyout); break; + case 's': fputws(L"ß", yyout); break; + case 'S': fputws(L"ß", yyout); break; + } + } +} + + + +\\[\^\"\'`]((\{\\[ij]\})|(\\[ij])) { + switch(yytext[1]){ + case '^': + if(yytext[4]=='i') + fputws(L"Ăź", yyout); + else + fputws(L"Ä”",yyout); + break; + case '\"': + if(yytext[4]=='i') + fputws(L"ĂŻ",yyout); + else + fputws(L"j",yyout); //should actually be j with umlaut + break; + case '\'': + if(yytext[4]=='i') + fputws(L"Ă­",yyout); + else + fputws(L"j",yyout); //should actually be j with accent + break; + case '`': + if(yytext[4]=='i') + fputws(L"ĂŹ",yyout); + else + fputws(L"k",yyout); //should actually be j with accent + break; + } +} + +\{\\oe\} { + fputws(L"Ɠ",yyout); +} + +\{\\OE\} { + fputws(L"ƒ",yyout); +} + +\{\\ae\} { + fputws(L"ĂŠ",yyout); +} + +\{\\AE\} { + fputws(L"Æ",yyout); +} + +\{\\aa\} { + fputws(L"Ă„",yyout); +} + +\{\\AA\} { + fputws(L"Å",yyout); +} + +\{\\o\} { + fputws(L"Ăž",yyout); +} + +\{\\O\} { + fputws(L"Ø",yyout); +} + +\{\\ss\} { + fputws(L"ß",yyout); +} + 
+\\#[0-9]+ { + fputws((wstring(L"")).c_str(),yyout); +} + +\\# { + fputws(L"", yyout); +} + +\\[`'\^\"H~ck=b.druv]((\{.\})|(.)) { + wstring ws = convertir(yytext,yyleng).c_str(); + + wstring result = accentsMap.get( + L""+ws.substr(1,1)+ ( + (yyleng==3)? ws.substr(2,1) : ws.substr(3,1) + )); + + if(result == L"") + { + fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); + } + else + { + fputws(result.c_str(), yyout); + } +} + +\\\\ { + fputws(L"
",yyout); +} + +\%.* { + if(yytext[yyleng-1]=='\r') + fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"\r")).c_str(),yyout); + else + fputws((wstring(L"")+convertir(yytext+1,yyleng-1)+wstring(L"")).c_str(),yyout); +} + +\\usepackage\[[^\]]*\] { + wstring ws = convertir(yytext+12,yyleng-13); + fputws((wstring(L"")+ws+wstring(L"")).c_str(), yyout); + if(ws.find(L"ngerman") != wstring::npos) + ngermanbabel = true; +} + +\[[^\]]*\] { + fputws((wstring(L"")+convertir(yytext+1,yyleng-2)+wstring(L"")).c_str(), yyout); +} + +\\begin[^a-zA-Z0-9_] { + BEGIN(readbrackets); + closesym = L""; +} + +\\end[^a-zA-Z0-9_] { + BEGIN(readbrackets); + closesym = L"/"; +} + + + +[ \n\r\t]*\{?[ \n\r\t]* { + wstring ws = convertir(yytext,yyleng); + int i = ws.find(L'{'); //remove it + if(i>=0) + ws = ws.substr(0,i)+ws.substr(i+1); + fputws(ws.c_str(),yyout); +} + +[a-zA-Z0-9]+\* { + fputws((wstring(L"<")+closesym+convertir(yytext,yyleng-1)+wstring(L"_STAR>")).c_str(),yyout); +} + +[a-zA-Z0-9]+ { + fputws((wstring(L"<")+closesym+convertir(yytext,yyleng)+wstring(L">")).c_str(),yyout); +} + +[ \n\r\t]*\}[ \n\r\t]* { + BEGIN(0); + wstring ws = convertir(yytext,yyleng); + int i = ws.find(L'}'); //remove it + if(i>=0) + ws = ws.substr(0,i)+ws.substr(i+1); + fputws(ws.c_str(),yyout); +} + + +\\[A-Za-z]+\* { + fputws((wstring(L"<")+convertir(yytext+1,yyleng-2)+wstring(L"_STAR/>")).c_str(),yyout); +} + +\\[A-Za-z]+ { + fputws((wstring(L"<")+convertir(yytext+1,yyleng)+wstring(L"/>")).c_str(),yyout); +} + +\\\{ { + fputws(L"", yyout); + } + +\\\{ { + fputws(L"", yyout); + } + +\\\% { + fputws(L"", yyout); + } + +\{ { + fputws(L"",yyout); +} + +\} { + fputws((wstring(L"")).c_str(),yyout); +} + +~ { + fputws(L"&NBSP;",yyout); +} + +\$\$ { + BEGIN(mathenv); + fputws(L"",yyout); +} + +\$\$ { + fputws(L"",yyout); + BEGIN(0); +} + +\$ { + BEGIN(mathenv); + fputws(L"",yyout); +} + +\$ { + fputws(L"",yyout); + BEGIN(0); +} + +\\verb[|][^|]+[|] { + fputws(L"",yyout); + wstring ws = 
convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[!][^!]+[!] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[?][^?]+[?] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[/][^/]+[/] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[#][^#]+[#] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\verb[+][^+]+[+] { + fputws(L"",yyout); + wstring ws = convertir(yytext, yyleng); + fputws(ws.substr(5, ws.size()-5).c_str(), yyout); + fputws(L"", yyout); +} + +\\\( { + fputws(L"",yyout); +} + +\\\) { + fputws(L"",yyout); +} + +\\\[ { + fputws(L"",yyout); +} + +\\\] { + fputws(L"",yyout); +} + +\?` { + fputws(L"Âż",yyout); +} + +!` { + fputws(L"ÂĄ",yyout); +} + +\" { + fputws(L""",yyout); +} +\' { + fputws(L"'",yyout); +} +\< { + fputws(L"<",yyout); +} +\> { + fputws(L">",yyout); +} +\\\& { + fputws(L"&",yyout); +} +\& { + fputws(L"",yyout); +} + + + + + +(.|\n|\r) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + +(.|\n) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + + +<> { + return 0; +} +%% + + + +void usage(string const &progname) +{ + + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + + cerr << "LaTeX format preprocessor " << endl; + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + size_t base = 0; + + if(argc >= 2 && !strcmp(argv[1],"-i")) + { + base++; + } + + if((argc-base) > 4) + { + usage(argv[0]); + } + + switch(argc-base) + { + case 3: + yyout = fopen(argv[2+base], "w"); + if(!yyout) + { + usage(argv[0]); + } + 
case 2: + yyin = fopen(argv[1+base], "r"); + if(!yyin) + { + usage(argv[0]); + } + break; + default: + break; + } + +#ifdef _WIN32 + _setmode(_fileno(yyin), _O_U8TEXT); + _setmode(_fileno(yyout), _O_U8TEXT); +#endif + // prevent warning message + yy_push_state(1); + yy_top_state(); + yy_pop_state(); + + yylex(); + + fclose(yyin); + fclose(yyout); +} Index: branches/apertium-tagger/apertium2/apertium/apertium_re.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_re.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_re.cc (revision 69632) @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +ApertiumRE::ApertiumRE() : +re(0) +{ + empty = true; +} + +ApertiumRE::~ApertiumRE() +{ + if(!empty) + { + pcre_free(re); + } + empty = true; +} + +void +ApertiumRE::read(FILE *input) +{ + unsigned int size = Compression::multibyte_read(input); + re = static_cast(pcre_malloc(size)); + if(size != fread(re, 1, size, input)) + { + wcerr << L"Error reading regexp" << endl; + exit(EXIT_FAILURE); + } + + empty = false; +} + +void +ApertiumRE::compile(string const &str) +{ + const char *error; + int erroroffset; + re = pcre_compile(str.c_str(), PCRE_DOTALL|PCRE_CASELESS|PCRE_EXTENDED|PCRE_UTF8, + &error, &erroroffset, NULL); + if(re == NULL) + { + wcerr << L"Error: pcre_compile "; + cerr << error << endl; + exit(EXIT_FAILURE); + } + + empty = false; +} + +void +ApertiumRE::write(FILE *output) const +{ + if(empty) + { + cerr << L"Error, cannot write empty regexp" << endl; + exit(EXIT_FAILURE); + } + + size_t size; + int rc = pcre_fullinfo(re, NULL, PCRE_INFO_SIZE, &size); + if(rc < 0) + { + wcerr << L"Error calling pcre_fullinfo()\n" << endl; + exit(EXIT_FAILURE); + } + + Compression::multibyte_write(size, output); + + size_t rc2 = fwrite(re, 1, size, output); + if(rc2 != size) + { + wcerr << L"Error writing precompiled regex\n" << endl; + exit(EXIT_FAILURE); + } +} + +string +ApertiumRE::match(string const &str) const +{ + if(empty) + { + return ""; + } + + int result[3]; + int workspace[4096]; +// int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); + int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); + + if(rc < 0) + { + switch(rc) + { + case PCRE_ERROR_NOMATCH: + return ""; + + default: + wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; + exit(EXIT_FAILURE); + } + } + + return str.substr(result[0], result[1]-result[0]); +} + +void 
+ApertiumRE::replace(string &str, string const &value) const +{ + if(empty) + { + return; + } + + int result[3]; + int workspace[4096]; + // int rc = pcre_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3); + int rc = pcre_dfa_exec(re, NULL, str.c_str(), str.size(), 0, PCRE_NO_UTF8_CHECK, result, 3, workspace, 4096); + if(rc < 0) + { + switch(rc) + { + case PCRE_ERROR_NOMATCH: + return; + + default: + wcerr << L"Error: Unknown error matching regexp (code " << rc << L")" << endl; + exit(EXIT_FAILURE); + } + } + + string res = str.substr(0, result[0]); + res.append(value); + res.append(str.substr(result[1])); + str = res; +} Index: branches/apertium-tagger/apertium2/apertium/interchunk.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.cc (revision 69632) @@ -0,0 +1,1603 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include "apertium_config.h"
+#include
+
+using namespace Apertium;
+using namespace std;
+/*
+ * Frees the rule matcher `me` and the parsed XML document `doc`, then resets
+ * both pointers to NULL so that a repeated call does nothing.
+ */
+void
+Interchunk::destroy()
+{
+  delete me;
+  me = NULL;
+
+  if(doc)
+  {
+    xmlFreeDoc(doc);
+    doc = NULL;
+  }
+}
+/*
+ * Builds a processor with nothing loaded: all pointers and counters are
+ * zeroed, all option flags are off and `emptyblank` is the empty string.
+ */
+Interchunk::Interchunk() :
+word(0),
+blank(0),
+lword(0),
+lblank(0),
+output(0),
+any_char(0),
+any_tag(0),
+nwords(0)
+{
+  me = NULL;
+  doc = NULL;
+  root_element = NULL;
+  lastrule = NULL;
+  inword = false;
+  null_flush = false;
+  internal_null_flush = false;
+  trace = false;
+  emptyblank = "";
+}
+/*
+ * Releases everything via destroy().
+ */
+Interchunk::~Interchunk()
+{
+  destroy();
+}
+/*
+ * Loads the compiled data from `in`, in this order: alphabet, transducer,
+ * final states, attribute regexps, variables, macros and lists.
+ * The sections are read back-to-back, so the order of the reads below is
+ * the file layout.
+ */
+void
+Interchunk::readData(FILE *in)
+{
+  alphabet.read(in);
+  any_char = alphabet(TRXReader::ANY_CHAR);
+  any_tag = alphabet(TRXReader::ANY_TAG);
+
+  Transducer t;
+  t.read(in, alphabet.size());
+
+  map finals; // NOTE(review): template arguments look stripped from this copy; keys are read as int below - confirm
+
+  // finals
+  for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
+  {
+    int key = Compression::multibyte_read(in);
+    finals[key] = Compression::multibyte_read(in);
+  }
+
+  me = new MatchExe(t, finals);
+
+  // attr_items
+  bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); // stored version string differs from the running PCRE
+  for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
+  {
+    string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in));
+    attr_items[cad_k].read(in); // serialized pattern is always consumed, even if it is replaced below
+    wstring fallback = Compression::wstring_read(in); // textual form of the same pattern
+    if(recompile_attrs) {
+      attr_items[cad_k].compile(UtfConverter::toUtf8(fallback));
+    }
+  }
+
+  // variables
+  for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
+  {
+    string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in));
+    variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in));
+  }
+
+  // macros
+  for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
+  {
+    string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in));
+    macros[cad_k] = 
Compression::multibyte_read(in); + } + + // lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +Interchunk::read(string const &transferfile, string const &datafile) +{ + readInterchunk(transferfile); + + // datafile + FILE *in = fopen(datafile.c_str(), "rb"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + +} + +void +Interchunk::readInterchunk(string const &in) +{ + doc = xmlReadFile(in.c_str(), NULL, 0); + + if(doc == NULL) + { + cerr << "Error: Could not parse file '" << in << "'." << endl; + exit(EXIT_FAILURE); + } + + root_element = xmlDocGetRootElement(doc); + + // search for macros & rules + for(xmlNode *i = root_element->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "section-def-macros")) + { + collectMacros(i); + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "section-rules")) + { + collectRules(i); + } + } + } +} + +void +Interchunk::collectRules(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + for(xmlNode *j = i->children; ; j = j->next) + { + if(j->type == XML_ELEMENT_NODE && !xmlStrcmp(j->name, (const xmlChar *) "action")) + { + rule_map.push_back(j); + break; + } + } + } + } +} + +void +Interchunk::collectMacros(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + macro_map.push_back(i); + } + } +} + +bool +Interchunk::checkIndex(xmlNode 
*element, int index, int limit)
+{
+  if(index >= limit) // only the upper bound is checked; negative indices pass
+  {
+    wcerr << L"Error in " << UtfConverter::fromUtf8((char *) doc->URL) <line << endl; // NOTE(review): part of this statement looks stripped from this copy - restore from the original file
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Evaluates the rule-file expression `element` to a string.
+ * The first evaluation of a node decodes it into a TransferInstr stored in
+ * evalStringCache and then re-enters this function; later evaluations are
+ * served from the cached instruction by the switch below.
+ * Throws a C string if `element` is NULL.
+ */
+string
+Interchunk::evalString(xmlNode *element)
+{
+  if (element == 0)
+  {
+    throw "Interchunk::evalString() was passed a NULL element";
+  }
+
+  map::iterator it; // NOTE(review): template arguments look stripped from this copy - confirm against the original file
+  it = evalStringCache.find(element);
+  if(it != evalStringCache.end())
+  {
+    TransferInstr &ti = it->second;
+    switch(ti.getType())
+    {
+      case ti_clip_tl:
+        if(checkIndex(element, ti.getPos(), lword))
+        {
+          if(ti.getContent() == "content") // jacob's new 'part'
+          {
+            string wf = word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]);
+            return wf.substr(1, wf.length()-2); // trim away the { and }
+          }
+          else
+          {
+            return word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]);
+          }
+        }
+        break;
+
+      case ti_var:
+        return variables[ti.getContent()];
+
+      case ti_lit_tag:
+      case ti_lit:
+        return ti.getContent();
+
+      case ti_b:
+        if(checkIndex(element, ti.getPos(), lblank))
+        {
+          if(ti.getPos() >= 0)
+          {
+            return !blank?"":*(blank[ti.getPos()]); // no blank array at all: empty string
+          }
+          return " "; // negative position: a single space
+        }
+        break;
+
+      case ti_get_case_from:
+        if(checkIndex(element, ti.getPos(), lword))
+        {
+          return copycase(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]),
+                          evalString((xmlNode *) ti.getPointer()));
+        }
+        break;
+
+      case ti_case_of_tl:
+        if(checkIndex(element, ti.getPos(), lword))
+        {
+          return caseOf(word[ti.getPos()]->chunkPart(attr_items[ti.getContent()]));
+        }
+        break;
+
+      default:
+        return "";
+    }
+    return ""; // reached when an index check above failed
+  }
+
+  if(!xmlStrcmp(element->name, (const xmlChar *) "clip"))
+  {
+    int pos = 0;
+    xmlChar *part = NULL;
+
+    for(xmlAttr *i = element->properties; i != NULL; i = i->next)
+    {
+      if(!xmlStrcmp(i->name, (const xmlChar *) "part"))
+      {
+        part = i->children->content;
+      }
+      else if(!xmlStrcmp(i->name, (const xmlChar *) "pos"))
+      {
+        pos = atoi((const char *)i->children->content) - 1; // attribute is 1-based, stored 0-based
+      }
+    }
+
+    evalStringCache[element] = TransferInstr(ti_clip_tl, 
(const char *) part, pos, NULL);
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag"))
+  {
+    evalStringCache[element] = TransferInstr(ti_lit_tag,
+                                             tags((const char *) element->properties->children->content), 0); // uses the first attribute's value
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "lit"))
+  {
+    evalStringCache[element] = TransferInstr(ti_lit, ((const char *) element->properties->children->content), 0);
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "b"))
+  {
+    if(element->properties == NULL)
+    {
+      evalStringCache[element] = TransferInstr(ti_b, " ", -1); // no attribute: position -1
+    }
+    else
+    {
+      int pos = atoi((const char *) element->properties->children->content) - 1;
+      evalStringCache[element] = TransferInstr(ti_b, "", pos);
+    }
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "get-case-from"))
+  {
+    int pos = atoi((const char *) element->properties->children->content) - 1;
+    xmlNode *param = NULL; // first element child: the expression whose value is re-cased
+    for(xmlNode *i = element->children; i != NULL; i = i->next)
+    {
+      if(i->type == XML_ELEMENT_NODE)
+      {
+        param = i;
+        break;
+      }
+    }
+
+    evalStringCache[element] = TransferInstr(ti_get_case_from, "lem", pos, param);
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "var"))
+  {
+    evalStringCache[element] = TransferInstr(ti_var, (const char *) element->properties->children->content, 0);
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "case-of"))
+  {
+    int pos = 0;
+    xmlChar *part = NULL;
+
+    for(xmlAttr *i = element->properties; i != NULL; i = i->next)
+    {
+      if(!xmlStrcmp(i->name, (const xmlChar *) "part"))
+      {
+        part = i->children->content;
+      }
+      else if(!xmlStrcmp(i->name, (const xmlChar *) "pos"))
+      {
+        pos = atoi((const char *) i->children->content) - 1;
+      }
+    }
+
+    evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos);
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "concat")) // not cached: evaluated afresh on every call
+  {
+    string value;
+    for(xmlNode *i = element->children; i != NULL; i = i->next)
+    {
+      if(i->type == XML_ELEMENT_NODE)
+      {
+        
value.append(evalString(i));
+      }
+    }
+    return value;
+  }
+  else if(!xmlStrcmp(element->name, (const xmlChar *) "chunk")) // not cached either
+  {
+    return processChunk(element);
+  }
+  else
+  {
+    cerr << "Error: unexpected rvalue expression '" << element->name << "'" << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  return evalString(element); // the node was just cached above; re-enter to evaluate it
+}
+/*
+ * Writes every element child of `localroot` to `output`: `chunk` children
+ * through processChunk(), anything else through evalString().
+ */
+void
+Interchunk::processOut(xmlNode *localroot)
+{
+  for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE)
+    {
+      if(!xmlStrcmp(i->name, (const xmlChar *) "chunk"))
+      {
+        fputws_unlocked(UtfConverter::fromUtf8(processChunk(i)).c_str(), output);
+      }
+      else // 'b'
+      {
+        fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output);
+      }
+    }
+  }
+}
+/*
+ * Returns the values of all element children of `localroot`, concatenated
+ * and wrapped between "^" and "$".
+ */
+string
+Interchunk::processChunk(xmlNode *localroot)
+{
+  string result;
+  result.append("^");
+
+  for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE)
+    {
+      result.append(evalString(i));
+    }
+  }
+
+  result.append("$");
+  return result;
+}
+/*
+ * Dispatches one instruction element to its handler by element name.
+ * Elements with any other name are ignored.
+ */
+void
+Interchunk::processInstruction(xmlNode *localroot)
+{
+  if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose"))
+  {
+    processChoose(localroot);
+  }
+  else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let"))
+  {
+    processLet(localroot);
+  }
+  else if(!xmlStrcmp(localroot->name, (const xmlChar *) "append"))
+  {
+    processAppend(localroot);
+  }
+  else if(!xmlStrcmp(localroot->name, (const xmlChar *) "out"))
+  {
+    processOut(localroot);
+  }
+  else if(!xmlStrcmp(localroot->name, (const xmlChar *) "call-macro"))
+  {
+    processCallMacro(localroot);
+  }
+  else if(!xmlStrcmp(localroot->name, (const xmlChar *) "modify-case"))
+  {
+    processModifyCase(localroot);
+  }
+}
+/*
+ * Assignment: the first element child of `localroot` is the target (a
+ * variable or a clip), the second is the expression whose value is stored.
+ */
+void
+Interchunk::processLet(xmlNode *localroot)
+{
+  xmlNode *leftSide = NULL, *rightSide = NULL;
+
+  for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE)
+    {
+      if(leftSide == NULL)
+      {
+        leftSide = i;
+      }
+      else
+      {
+        rightSide = i;
+        break;
+      }
+    }
+  }
+
+  map::iterator it = evalStringCache.find(leftSide); // NOTE(review): template arguments look stripped from this copy - confirm against the original file
+  if(it != evalStringCache.end()) // target already decoded on an earlier call
+  {
+    TransferInstr &ti = it->second;
+    switch(ti.getType())
+    {
+      case ti_var:
+        variables[ti.getContent()] = evalString(rightSide);
+        return;
+
+      case ti_clip_tl:
+        word[ti.getPos()]->setChunkPart(attr_items[ti.getContent()], evalString(rightSide)); // position is not range-checked here
+        return;
+
+      default:
+        return;
+    }
+  }
+  if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var"))
+  {
+    string const val = (const char *) leftSide->properties->children->content;
+    variables[val] = evalString(rightSide);
+    evalStringCache[leftSide] = TransferInstr(ti_var, val, 0);
+  }
+  else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "clip"))
+  {
+    int pos = 0;
+    xmlChar *part = NULL;
+
+    for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next)
+    {
+      if(!xmlStrcmp(i->name, (const xmlChar *) "part"))
+      {
+        part = i->children->content;
+      }
+      else if(!xmlStrcmp(i->name, (const xmlChar *) "pos"))
+      {
+        pos = atoi((const char *) i->children->content) - 1; // attribute is 1-based, stored 0-based
+      }
+    }
+
+
+    word[pos]->setChunkPart(attr_items[(const char *) part],
+                            evalString(rightSide));
+    evalStringCache[leftSide] = TransferInstr(ti_clip_tl,
+                                              (const char *) part,
+                                              pos, NULL);
+  }
+}
+/*
+ * Appends the value of every element child of `localroot` to the variable
+ * named by the element's `n` attribute.
+ */
+void
+Interchunk::processAppend(xmlNode *localroot)
+{
+  string name;
+  for(xmlAttr *i = localroot->properties; i != NULL; i = i->next)
+  {
+    if(!xmlStrcmp(i->name, (const xmlChar *) "n"))
+    {
+      name = (char *) i->children->content;
+      break;
+    }
+  }
+
+  for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE)
+    {
+      variables[name].append(evalString(i));
+    }
+  }
+}
+/*
+ * Rewrites the target (first element child: a clip or a variable) with the
+ * result of copycase(value of the second element child, current target value).
+ */
+void
+Interchunk::processModifyCase(xmlNode *localroot)
+{
+  xmlNode *leftSide = NULL, *rightSide = NULL;
+
+  for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+  {
+    if(i->type == XML_ELEMENT_NODE)
+    {
+      if(leftSide == NULL)
+      {
+        leftSide = i;
+      }
+      else
+      {
+        rightSide = i;
+        break;
+      }
+    }
+  }
+
+  if(leftSide->name != NULL && !xmlStrcmp(leftSide->name, 
(const xmlChar *) "clip"))
+  {
+    int pos = 0;
+    xmlChar *part = NULL;
+
+    for(xmlAttr *i = leftSide->properties; i != NULL; i = i->next)
+    {
+      if(!xmlStrcmp(i->name, (const xmlChar *) "part"))
+      {
+        part = i->children->content;
+      }
+      else if(!xmlStrcmp(i->name, (const xmlChar *) "pos"))
+      {
+        pos = atoi((const char *) i->children->content) - 1; // attribute is 1-based, stored 0-based
+      }
+    }
+
+    string const result = copycase(evalString(rightSide),
+                                   word[pos]->chunkPart(attr_items[(const char *) part]));
+    word[pos]->setChunkPart(attr_items[(const char *) part], result);
+  }
+  else if(!xmlStrcmp(leftSide->name, (const xmlChar *) "var"))
+  {
+    string const val = (const char *) leftSide->properties->children->content;
+    variables[val] = copycase(evalString(rightSide), variables[val]);
+  }
+}
+/*
+ * Runs the macro named by the first attribute of `localroot`, temporarily
+ * replacing `word`, `blank` and `lword` with per-call arrays built from the
+ * call's parameters.
+ */
+void
+Interchunk::processCallMacro(xmlNode *localroot)
+{
+  const char *n = (const char *) localroot->properties->children->content;
+  int npar = 0;
+
+  xmlNode *macro = macro_map[macros[n]]; // NOTE(review): presumably `macros` is a std::map, so an unknown name would default-insert index 0 - verify
+
+  for(xmlAttr *i = macro->properties; i != NULL; i = i->next)
+  {
+    if(!xmlStrcmp(i->name, (const xmlChar *) "npar"))
+    {
+      npar = atoi((const char *) i->children->content);
+      break;
+    }
+  }
+
+  // ToDo: Is it at all valid if npar <= 0 ?
+ + InterchunkWord **myword = NULL; + if(npar > 0) + { + myword = new InterchunkWord *[npar]; + } + string **myblank = NULL; + if(npar > 0) + { + myblank = new string *[npar]; + myblank[npar-1] = &emptyblank; + } + + int idx = 0; + int lastpos = 0; + for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + int pos = atoi((const char *) i->properties->children->content)-1; + myword[idx] = word[pos]; + if(idx-1 >= 0) + { + myblank[idx-1] = blank[lastpos]; + } + idx++; + lastpos = pos; + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + for(xmlNode *i = macro->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } + + swap(myword, word); + swap(myblank, blank); + swap(npar, lword); + + delete[] myword; + delete[] myblank; +} + +void +Interchunk::processChoose(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(i->name, (const xmlChar *) "when")) + { + bool picked_option = false; + + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + if(!xmlStrcmp(j->name, (const xmlChar *) "test")) + { + if(!processTest(j)) + { + break; + } + else + { + picked_option = true; + } + } + else + { + processInstruction(j); + } + } + } + if(picked_option) + { + return; + } + } + else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise")) + { + for(xmlNode *j = i->children; j != NULL; j = j->next) + { + if(j->type == XML_ELEMENT_NODE) + { + processInstruction(j); + } + } + } + } + } +} + +bool +Interchunk::processLogical(xmlNode *localroot) +{ + if(!xmlStrcmp(localroot->name, (const xmlChar *) "equal")) + { + return processEqual(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "begins-with")) + { + return processBeginsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) 
"begins-with-list")) + { + return processBeginsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with")) + { + return processEndsWith(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "ends-with-list")) + { + return processEndsWithList(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "contains-substring")) + { + return processContainsSubstring(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "or")) + { + return processOr(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "and")) + { + return processAnd(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not")) + { + return processNot(localroot); + } + else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in")) + { + return processIn(localroot); + } + + return false; +} + +bool +Interchunk::processIn(xmlNode *localroot) +{ + xmlNode *value = NULL; + xmlChar *idlist = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(value == NULL) + { + value = i; + } + else + { + idlist = i->properties->children->content; + break; + } + } + } + + string sval = evalString(value); + + if(localroot->properties != NULL) + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + set &myset = listslow[(const char *) idlist]; + if(myset.find(tolower(sval)) != myset.end()) + { + return true; + } + else + { + return false; + } + } + } + + set &myset = lists[(const char *) idlist]; + if(myset.find(sval) != myset.end()) + { + return true; + } + else + { + return false; + } +} + +bool +Interchunk::processTest(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return processLogical(i); + } + } + return false; +} + +bool +Interchunk::processAnd(xmlNode *localroot) +{ + bool val = true; + for(xmlNode *i = 
localroot->children; val && i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val && processLogical(i); + } + } + + return val; +} + +bool +Interchunk::processOr(xmlNode *localroot) +{ + bool val = false; + for(xmlNode *i = localroot->children; !val && i != NULL ; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + val = val || processLogical(i); + } + } + + return val; +} + +bool +Interchunk::processNot(xmlNode *localroot) +{ + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + return !processLogical(i); + } + } + return false; +} + +bool +Interchunk::processEqual(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first) == evalString(second); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)) == tolower(evalString(second)); + } + else + { + return evalString(first) == evalString(second); + } + } +} + +bool +Interchunk::beginsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = 0; i != limit; i++) + { + if(s1[i] != s2[i]) + { + return false; + } + } + + return true; +} + +bool +Interchunk::endsWith(string const &s1, string const &s2) const +{ + int const limit = s2.size(), constraint = s1.size(); + + if(constraint < limit) + { + return false; + } + for(int i = limit-1, j = constraint - 1; i >= 0; i--, j--) + { + if(s1[j] != s2[i]) + { + return false; + } + } + + return true; +} + + +bool +Interchunk::processBeginsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = 
localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return beginsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return beginsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return beginsWith(evalString(first), evalString(second)); + } + } +} + +bool +Interchunk::processEndsWith(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return endsWith(evalString(first), evalString(second)); + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return endsWith(tolower(evalString(first)), tolower(evalString(second))); + } + else + { + return endsWith(evalString(first), evalString(second)); + } + } +} + +bool +Interchunk::processBeginsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + 
for(; it != limit; it++) + { + if(beginsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Interchunk::processEndsWithList(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + xmlChar *idlist = second->properties->children->content; + string needle = evalString(first); + set::iterator it, limit; + + if(localroot->properties == NULL || + xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes")) + { + it = lists[(const char *) idlist].begin(); + limit = lists[(const char *) idlist].end(); + } + else + { + needle = tolower(needle); + it = listslow[(const char *) idlist].begin(); + limit = listslow[(const char *) idlist].end(); + } + + for(; it != limit; it++) + { + if(endsWith(needle, *it)) + { + return true; + } + } + return false; +} + +bool +Interchunk::processContainsSubstring(xmlNode *localroot) +{ + xmlNode *first = NULL, *second = NULL; + + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + if(first == NULL) + { + first = i; + } + else + { + second = i; + break; + } + } + } + + if(localroot->properties == NULL) + { + return evalString(first).find(evalString(second)) != string::npos; + } + else + { + if(!xmlStrcmp(localroot->properties->children->content, + (const xmlChar *) "yes")) + { + return tolower(evalString(first)).find(tolower(evalString(second))) != string::npos; + } + else + { + return evalString(first).find(evalString(second)) != string::npos; + } + } +} + +string +Interchunk::copycase(string const &source_word, string const &target_word) +{ + wstring result; + wstring const s_word = UtfConverter::fromUtf8(source_word); + wstring const t_word = UtfConverter::fromUtf8(target_word); + + bool firstupper = iswupper(s_word[0]); + bool uppercase = 
firstupper && iswupper(s_word[s_word.size()-1]); + bool sizeone = s_word.size() == 1; + + if(!uppercase || (sizeone && uppercase)) + { + result = StringUtils::tolower(t_word); + } + else + { + result = StringUtils::toupper(t_word); + } + + if(firstupper) + { + result[0] = towupper(result[0]); + } + + return UtfConverter::toUtf8(result); +} + +string +Interchunk::caseOf(string const &str) +{ + wstring const s = UtfConverter::fromUtf8(str); + + if(s.size() > 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else if(!iswupper(s[s.size()-1])) + { + return "Aa"; + } + else + { + return "AA"; + } + } + else if(s.size() == 1) + { + if(!iswupper(s[0])) + { + return "aa"; + } + else + { + return "Aa"; + } + } + else + { + return "aa"; + } +} + +string +Interchunk::tolower(string const &str) const +{ + return UtfConverter::toUtf8(StringUtils::tolower(UtfConverter::fromUtf8(str))); +} + +string +Interchunk::tags(string const &str) const +{ + string result = "<"; + + for(unsigned int i = 0, limit = str.size(); i != limit; i++) + { + if(str[i] == '.') + { + result.append("><"); + } + else + { + result += str[i]; + } + } + + result += '>'; + + return result; +} + +void +Interchunk::processRule(xmlNode *localroot) +{ + // localroot is supposed to be an 'action' tag + for(xmlNode *i = localroot->children; i != NULL; i = i->next) + { + if(i->type == XML_ELEMENT_NODE) + { + processInstruction(i); + } + } +} + +TransferToken & +Interchunk::readToken(FILE *in) +{ + if(!input_buffer.isEmpty()) + { + return input_buffer.next(); + } + + wstring content; + while(true) + { + int val = fgetwc_unlocked(in); + if(feof(in) || (internal_null_flush && val == 0)) + { + return input_buffer.add(TransferToken(content, tt_eof)); + } + if(val == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L'[') + { + content += L'['; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += 
wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } + } + } + else if(inword && val == L'{') + { + content += L'{'; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L'}') + { + wint_t val3 = wchar_t(fgetwc_unlocked(in)); + ungetwc(val3, in); + + content += L'}'; + if(val3 == L'$') + { + break; + } + } + else + { + content += wchar_t(val2); + } + } + } + else if(inword && val == L'$') + { + inword = false; + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val == L'^') + { + inword = true; + return input_buffer.add(TransferToken(content, tt_blank)); + } + else + { + content += wchar_t(val); + } + } +} + +bool +Interchunk::getNullFlush(void) +{ + return null_flush; +} + +void +Interchunk::setNullFlush(bool null_flush) +{ + this->null_flush = null_flush; +} + +void +Interchunk::setTrace(bool trace) +{ + this->trace = trace; +} + +void +Interchunk::interchunk_wrapper_null_flush(FILE *in, FILE *out) +{ + null_flush = false; + internal_null_flush = true; + + while(!feof(in)) + { + interchunk(in, out); + fputwc_unlocked(L'\0', out); + int code = fflush(out); + if(code != 0) + { + wcerr << L"Could not flush output " << errno << endl; + } + } + internal_null_flush = false; + null_flush = true; +} + + +void +Interchunk::interchunk(FILE *in, FILE *out) +{ + if(getNullFlush()) + { + interchunk_wrapper_null_flush(in, out); + } + + int last = 0; + + output = out; + ms.init(me->getInitial()); + + while(true) + { + if(ms.size() == 0) + { + if(lastrule != NULL) + { + applyRule(); + input_buffer.setPos(last); + } + else + { + if(tmpword.size() != 0) + { + fputwc_unlocked(L'^', output); + fputws_unlocked(tmpword[0]->c_str(), output); + fputwc_unlocked(L'$', output); + tmpword.clear(); + input_buffer.setPos(last); + input_buffer.next(); + last = input_buffer.getPos(); 
+ ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) + { + fputws_unlocked(tmpblank[0]->c_str(), output); + tmpblank.clear(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + } + } + int val = ms.classifyFinals(me->getFinals()); + if(val != -1) + { + lastrule = rule_map[val-1]; + last = input_buffer.getPos(); + + if(trace) + { + wcerr << endl << L"apertium-interchunk: Rule " << val << L" "; + for (unsigned int ind = 0; ind < tmpword.size(); ind++) + { + if (ind != 0) + { + wcerr << L" "; + } + wcerr << *tmpword[ind]; + } + wcerr << endl; + } + } + + TransferToken &current = readToken(in); + + switch(current.getType()) + { + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(&current.getContent()); + break; + + case tt_blank: + ms.step(L' '); + tmpblank.push_back(&current.getContent()); + break; + + case tt_eof: + if(tmpword.size() != 0) + { + tmpblank.push_back(&current.getContent()); + ms.clear(); + } + else + { + fputws_unlocked(current.getContent().c_str(), output); + tmpblank.clear(); + return; + } + break; + + default: + cerr << "Error: Unknown input token." 
<< endl; + return; + } + } +} + +void +Interchunk::applyRule() +{ + unsigned int limit = tmpword.size(); + + for(unsigned int i = 0; i != limit; i++) + { + if(i == 0) + { + word = new InterchunkWord *[limit]; + lword = limit; + if(limit != 1) + { + blank = new string *[limit - 1]; + lblank = limit - 1; + } + else + { + blank = NULL; + lblank = 0; + } + } + else + { + blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1])); + } + + word[i] = new InterchunkWord(UtfConverter::toUtf8(*tmpword[i])); + } + + processRule(lastrule); + lastrule = NULL; + + if(word) + { + for(unsigned int i = 0; i != limit; i++) + { + delete word[i]; + } + delete[] word; + } + if(blank) + { + for(unsigned int i = 0; i != limit - 1; i++) + { + delete blank[i]; + } + delete[] blank; + } + word = NULL; + blank = NULL; + tmpword.clear(); + tmpblank.clear(); + ms.init(me->getInitial()); +} + +void +Interchunk::applyWord(wstring const &word_str) +{ + ms.step(L'^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) + { + switch(word_str[i]) + { + case L'\\': + i++; + ms.step(towlower(word_str[i]), any_char); + break; + + case L'<': + for(unsigned int j = i+1; j != limit; j++) + { + if(word_str[j] == L'>') + { + int symbol = alphabet(word_str.substr(i, j-i+1)); + if(symbol) + { + ms.step(symbol, any_tag); + } + else + { + ms.step(any_tag); + } + i = j; + break; + } + } + break; + + case L'{': // ignore the unmodifiable part of the chunk + ms.step(L'$'); + return; + + default: + ms.step(towlower(word_str[i]), any_char); + break; + } + } + ms.step(L'$'); +} Index: branches/apertium-tagger/apertium2/apertium/interchunk.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.h (revision 69632) @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _INTERCHUNK_ +#define _INTERCHUNK_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +class Interchunk +{ +private: + + Alphabet alphabet; + MatchExe *me; + MatchState ms; + map attr_items; + map variables; + map macros; + map, Ltstr> lists; + map, Ltstr> listslow; + vector macro_map; + vector rule_map; + xmlDoc *doc; + xmlNode *root_element; + InterchunkWord **word; + string **blank; + int lword, lblank; + Buffer input_buffer; + vector tmpword; + vector tmpblank; + + FILE *output; + int any_char; + int any_tag; + + xmlNode *lastrule; + unsigned int nwords; + + map evalStringCache; + bool inword; + bool null_flush; + bool internal_null_flush; + bool trace; + string emptyblank; + + void destroy(); + void readData(FILE *input); + void readInterchunk(string const &input); + void collectMacros(xmlNode *localroot); + void collectRules(xmlNode *localroot); + string caseOf(string const &str); + string copycase(string const &source_word, string const &target_word); + + void processLet(xmlNode *localroot); + void processAppend(xmlNode *localroot); + void processOut(xmlNode *localroot); + void processCallMacro(xmlNode *localroot); + void processModifyCase(xmlNode *localroot); + bool processLogical(xmlNode *localroot); + bool processTest(xmlNode 
*localroot); + bool processAnd(xmlNode *localroot); + bool processOr(xmlNode *localroot); + bool processEqual(xmlNode *localroot); + bool processBeginsWith(xmlNode *localroot); + bool processBeginsWithList(xmlNode *localroot); + bool processEndsWith(xmlNode *localroot); + bool processEndsWithList(xmlNode *localroot); + bool processContainsSubstring(xmlNode *localroot); + bool processNot(xmlNode *localroot); + bool processIn(xmlNode *localroot); + void processRule(xmlNode *localroot); + string evalString(xmlNode *localroot); + void processInstruction(xmlNode *localroot); + void processChoose(xmlNode *localroot); + string processChunk(xmlNode *localroot); + + bool beginsWith(string const &str1, string const &str2) const; + bool endsWith(string const &str1, string const &str2) const; + string tolower(string const &str) const; + string tags(string const &str) const; + string readWord(FILE *in); + string readBlank(FILE *in); + string readUntil(FILE *in, int const symbol) const; + void applyWord(wstring const &word_str); + void applyRule(); + TransferToken & readToken(FILE *in); + bool checkIndex(xmlNode *element, int index, int limit); + void interchunk_wrapper_null_flush(FILE *in, FILE *out); + +public: + Interchunk(); + ~Interchunk(); + + void read(string const &transferfile, string const &datafile); + void interchunk(FILE *in, FILE *out); + bool getNullFlush(void); + void setNullFlush(bool null_flush); + void setTrace(bool trace); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/postchunk.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/postchunk.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.h (revision 69632) @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the 
Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _POSTCHUNK_ +#define _POSTCHUNK_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +class Postchunk +{ +private: + + Alphabet alphabet; + MatchExe *me; + MatchState ms; + map attr_items; + map variables; + map macros; + map, Ltstr> lists; + map, Ltstr> listslow; + vector macro_map; + vector rule_map; + xmlDoc *doc; + xmlNode *root_element; + InterchunkWord **word; + string **blank; + int lword, lblank; + Buffer input_buffer; + vector tmpword; + vector tmpblank; + + FILE *output; + int any_char; + int any_tag; + + xmlNode *lastrule; + unsigned int nwords; + + map evalStringCache; + + bool inword; + bool null_flush; + bool internal_null_flush; + + void destroy(); + void readData(FILE *input); + void readPostchunk(string const &input); + void collectMacros(xmlNode *localroot); + void collectRules(xmlNode *localroot); + static string caseOf(string const &str); + static wstring caseOf(wstring const &str); + string copycase(string const &source_word, string const &target_word); + + void processLet(xmlNode *localroot); + void processAppend(xmlNode *localroot); + void processOut(xmlNode *localroot); + void processCallMacro(xmlNode *localroot); + void processModifyCase(xmlNode *localroot); + bool processLogical(xmlNode *localroot); + bool processTest(xmlNode *localroot); + bool processAnd(xmlNode *localroot); + bool processOr(xmlNode *localroot); + bool 
processEqual(xmlNode *localroot); + bool processBeginsWith(xmlNode *localroot); + bool processBeginsWithList(xmlNode *localroot); + bool processEndsWith(xmlNode *localroot); + bool processEndsWithList(xmlNode *localroot); + bool processContainsSubstring(xmlNode *localroot); + bool processNot(xmlNode *localroot); + bool processIn(xmlNode *localroot); + void processRule(xmlNode *localroot); + string evalString(xmlNode *localroot); + void processInstruction(xmlNode *localroot); + void processChoose(xmlNode *localroot); + void processTags(xmlNode *localroot); + bool beginsWith(string const &str1, string const &str2) const; + bool endsWith(string const &str1, string const &str2) const; + string tolower(string const &str) const; + string tags(string const &str) const; + string readWord(FILE *in); + string readBlank(FILE *in); + string readUntil(FILE *in, int const symbol) const; + void applyWord(wstring const &word_str); + void applyRule(); + TransferToken & readToken(FILE *in); + static void unchunk(wstring const &chunk, FILE *output); + static vector getVecTags(wstring const &chunk); + static int beginChunk(wstring const &chunk); + static int endChunk(wstring const &chunk); + static void splitWordsAndBlanks(wstring const &chunk, + vector &words, + vector &blanks); + static wstring pseudolemma(wstring const &chunk); + static wstring wordzero(wstring const &chunk); + bool checkIndex(xmlNode *element, int index, int limit); + void postchunk_wrapper_null_flush(FILE *in, FILE *out); + +public: + Postchunk(); + ~Postchunk(); + + void read(string const &transferfile, string const &datafile); + void postchunk(FILE *in, FILE *out); + bool getNullFlush(void); + void setNullFlush(bool null_flush); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tagger_data_hmm.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_data_hmm.cc (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tagger_data_hmm.cc (revision 69632) @@ -0,0 +1,403 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#include +#include +#include +#include +#include + +using namespace Apertium; + +void +TaggerDataHMM::destroy() +{ + if(a != NULL) + { + for(int i = 0; i != N; i++) + { + delete [] a[i]; + } + delete [] a; + } + a = NULL; + + if(b != NULL) + { + for(int i = 0; i != N; i++) + { + delete [] b[i]; + } + delete [] b; + } + b = NULL; + + N = 0; + M = 0; +} + +TaggerDataHMM::TaggerDataHMM() +{ + a = NULL; + b = NULL; + N = 0; + M = 0; +} + +TaggerDataHMM::~TaggerDataHMM() +{ + destroy(); +} + +TaggerDataHMM::TaggerDataHMM(TaggerDataHMM const &o) +{ + a = NULL; + b = NULL; + N = 0; + M = 0; + + TaggerData::copy(o); + this->setProbabilities(o.N, o.M, o.a, o.b); +} + +TaggerDataHMM::TaggerDataHMM(TaggerData const &o) +{ + + a = NULL; + b = NULL; + N = 0; + M = 0; + + TaggerData::copy(o); +} + +TaggerDataHMM & +TaggerDataHMM::operator =(TaggerDataHMM const &o) +{ + if(this != &o) + { + destroy(); + TaggerData::copy(o); + this->setProbabilities(o.N, o.M, o.a, o.b); + } + return *this; +} + +void +TaggerDataHMM::setProbabilities(int const myN, int const myM, + double **myA, double **myB) +{ + this->destroy(); + N = myN; + M = myM; + + if(N != 0 && M != 0) + { + // NxN matrix + a = 
new double * [N]; + for(int i = 0; i != N; i++) + { + a[i] = new double[N]; + if(myA != NULL) + { + for(int j = 0; j != N; j++) // ToDo: N should be M? Check use of N and M in this function + { + a[i][j] = myA[i][j]; + } + } + } + + // NxM matrix + b = new double * [N]; + for(int i = 0; i != N; i++) + { + b[i] = new double[M]; + if(myB != NULL) + { + for(int j = 0; j != M; j++) + { + b[i][j] = myB[i][j]; + } + } + } + } + else + { + a = NULL; + b = NULL; + } +} + +double ** +TaggerDataHMM::getA() +{ + return a; +} + +double ** +TaggerDataHMM::getB() +{ + return b; +} + +int +TaggerDataHMM::getN() +{ + return N; +} + +int +TaggerDataHMM::getM() +{ + return M; +} + +void +TaggerDataHMM::read(FILE *in) +{ + destroy(); + + // open_class + int val = 0; + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + val += Compression::multibyte_read(in); + open_class.insert(val); + } + + // forbid_rules + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + TForbidRule aux; + aux.tagi = Compression::multibyte_read(in); + aux.tagj = Compression::multibyte_read(in); + forbid_rules.push_back(aux); + } + + + // array_tags + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + array_tags.push_back(Compression::wstring_read(in)); + } + + // tag_index + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + wstring tmp = Compression::wstring_read(in); + tag_index[tmp] = Compression::multibyte_read(in); + } + + // enforce_rules + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + TEnforceAfterRule aux; + aux.tagi = Compression::multibyte_read(in); + for(int j = Compression::multibyte_read(in); j != 0; j--) + { + aux.tagsj.push_back(Compression::multibyte_read(in)); + } + enforce_rules.push_back(aux); + } + + // prefer_rules + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + prefer_rules.push_back(Compression::wstring_read(in)); + } + + // constants + constants.read(in); + + // output + output.read(in); + + // 
dimensions + N = Compression::multibyte_read(in); + M = Compression::multibyte_read(in); + + + a = new double * [N]; + b = new double * [N]; + for(int i = 0; i != N; i++) + { + a[i] = new double[N]; + b[i] = new double[M]; + } + + // read a + for(int i = 0; i != N; i++) + { + for(int j = 0; j != N; j++) + { + a[i][j] = EndianDoubleUtil::read(in); + } + } + + // initializing b matrix + for(int i = 0 ; i != N; i++) + { + for(int j = 0; j != M; j++) + { + b[i][j] = ZERO; + } + } + + // read nonZERO values of b + int nval = Compression::multibyte_read(in); + + for(; nval != 0; --nval) + { + int i = Compression::multibyte_read(in); + int j = Compression::multibyte_read(in); + b[i][j] = EndianDoubleUtil::read(in); + } + + // read pattern list + plist.read(in); + + // read discards on ambiguity + discard.clear(); + + int limit = Compression::multibyte_read(in); + if(feof(in)) + { + return; + } + + for(int i = 0; i < limit; i++) + { + discard.push_back(Compression::wstring_read(in)); + } +} + +void +TaggerDataHMM::write(FILE *out) +{ + + // open_class + Compression::multibyte_write(open_class.size(), out); + int val = 0; + for(set::const_iterator it = open_class.begin(), limit = open_class.end(); + it != limit; it++) + { + Compression::multibyte_write(*it-val, out); + val = *it; + } + + // forbid_rules + Compression::multibyte_write(forbid_rules.size(), out); + for(unsigned int i = 0, limit = forbid_rules.size(); i != limit; i++) + { + Compression::multibyte_write(forbid_rules[i].tagi, out); + Compression::multibyte_write(forbid_rules[i].tagj, out); + } + + // array_tags + Compression::multibyte_write(array_tags.size(), out); + for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++) + { + Compression::wstring_write(array_tags[i], out); + } + + // tag_index + Compression::multibyte_write(tag_index.size(), out); + for(map::iterator it = tag_index.begin(), limit = tag_index.end(); + it != limit; it++) + { + Compression::wstring_write(it->first, out); + 
Compression::multibyte_write(it->second, out); + } + + // enforce_rules + Compression::multibyte_write(enforce_rules.size(), out); + for(unsigned int i = 0, limit = enforce_rules.size(); i != limit; i++) + { + Compression::multibyte_write(enforce_rules[i].tagi, out); + Compression::multibyte_write(enforce_rules[i].tagsj.size(), out); + for(unsigned int j = 0, limit2 = enforce_rules[i].tagsj.size(); j != limit2; j++) + { + Compression::multibyte_write(enforce_rules[i].tagsj[j], out); + } + } + + // prefer_rules + Compression::multibyte_write(prefer_rules.size(), out); + for(unsigned int i = 0, limit = prefer_rules.size(); i != limit; i++) + { + Compression::wstring_write(prefer_rules[i], out); + } + + // constants + constants.write(out); + + // output + output.write(out); + + // a matrix + Compression::multibyte_write(N, out); + Compression::multibyte_write(M, out); + for(int i = 0; i != N; i++) + { + for(int j = 0; j != N; j++) + { + EndianDoubleUtil::write(out, a[i][j]); + } + } + + // b matrix, writing only useful values + + int nval = 0; + for(int i = 0; i != N; i++) + { + for(int j = 0; j != M; j++) + { + if(output[j].find(i) != output[j].end()) + { + nval++; + } + } + } + + Compression::multibyte_write(nval, out); + for(int i = 0; i != N; i++) + { + for(int j = 0; j != M; j++) + { + if(output[j].find(i) != output[j].end()) + { + Compression::multibyte_write(i, out); + Compression::multibyte_write(j, out); + EndianDoubleUtil::write(out, b[i][j]); + } + } + } + + // write pattern list + plist.write(out); + + // write discard list + + if(discard.size() != 0) + { + Compression::multibyte_write(discard.size(), out); + for(unsigned int i = 0, limit = discard.size(); i != limit; i++) + { + Compression::wstring_write(discard[i], out); + } + } +} + Index: branches/apertium-tagger/apertium2/apertium/tagger_word.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_word.cc (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tagger_word.cc (revision 69632) @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; + +bool TaggerWord::generate_marks=false; + +vector TaggerWord::array_tags; + +bool TaggerWord::show_ignored_string=true; + +map TaggerWord::patterns; + +TaggerWord::TaggerWord(bool prev_plus_cut) : +show_sf(false) +{ + ignored_string = L""; + plus_cut=false; + previous_plus_cut=prev_plus_cut; +} + +TaggerWord::TaggerWord(const TaggerWord &w){ + superficial_form = w.superficial_form; + tags = w.tags; + show_sf = false; + lexical_forms = w.lexical_forms; + ignored_string = w.ignored_string; + plus_cut = w.plus_cut; + previous_plus_cut=w.previous_plus_cut; +} + +TaggerWord::~TaggerWord(){ +} + +void +TaggerWord::set_show_sf(bool sf){ + show_sf = sf; +} + +bool +TaggerWord::get_show_sf(){ + return show_sf; +} + +void +TaggerWord::set_superficial_form(const wstring &sf){ + superficial_form = sf; +} + +wstring& +TaggerWord::get_superficial_form() { + return superficial_form; +} + +bool +TaggerWord::match(wstring const &s, wstring const &pattern) +{ + map::iterator it = patterns.find(pattern); + string const utfs = UtfConverter::toUtf8(s); + + if(it == patterns.end()) + { + string 
utfpattern = UtfConverter::toUtf8(pattern); + /* Rewrite each <*> wildcard into a regex that matches one or more tags written between angle brackets. */ + + while(true) + { + size_t pos = utfpattern.find("<*>"); + if(pos == string::npos) + { + break; + } + utfpattern.replace(pos, 3, "(<[^>]+>)+"); + } + patterns[pattern].compile(utfpattern); + return patterns[pattern].match(utfs) != ""; + } + else + { + return it->second.match(utfs) != ""; + } +} + +/* Registers lexical form lf under tag t. A tag not seen before is inserted together with lf. For a tag that is already present, lf replaces the stored lexical form only when it matches one of prefer_rules (the scan stops at the first rule that matches). */ void +TaggerWord::add_tag(TTag &t, const wstring &lf, vector const &prefer_rules){ + + //The tag is added only if it is not present yet + //Sometimes one word can have more than one lexical form assigned to the same tag + if (tags.find(t)==tags.end()) { + tags.insert(t); + lexical_forms[t]=lf; + } else { + //Take a look at the prefer rules + for(int i=0; i < (int) prefer_rules.size(); i++) + { + if (match(lf, prefer_rules[i])) + { + lexical_forms[t]=lf; + break; + } + } + } +} + +/* Returns a mutable reference to the set of tags of this word. */ set& +TaggerWord::get_tags() { + return tags; +} + +/* A word is ambiguous when it carries more than one tag. */ bool +TaggerWord::isAmbiguous() const +{ + return tags.size() > 1; +} + +/* Returns the tags of this word as text: an opening brace, the array_tags entry of each tag separated by commas, and a closing brace. */ wstring +TaggerWord::get_string_tags() { + wstring st; + set::iterator itag = tags.begin(); + + st=L"{"; + for(itag=tags.begin(); itag!=tags.end(); itag++) { + if (itag!=tags.begin()) + st+=L','; + st+=array_tags[*itag]; + } + st += L'}'; + + return st; +} + +/* Builds the output text of this word for the chosen tag t. The result starts with ignored_string when show_ignored_string is set, and is returned at that point when t equals TAG_kEOF. Unless previous_plus_cut is set, the word is opened with ^ (or ^= for an ambiguous word when generate_marks is on), followed by the superficial form and a slash when show_sf is on. An unknown word (no lexical forms, or a first lexical form beginning with an asterisk) is written as an asterisk plus the superficial form; otherwise the lexical form stored for t is written. */ wstring +TaggerWord::get_lexical_form(TTag &t, int const TAG_kEOF) { + wstring ret= L""; + + if (show_ignored_string) + ret.append(ignored_string); + + if(t==TAG_kEOF) + return ret; + + if (!previous_plus_cut){ + if(TaggerWord::generate_marks && isAmbiguous()) + { + ret.append(L"^="); + } + else + { + ret += L'^'; + } + + if(get_show_sf()){ // append the superficial form + ret.append(superficial_form); + ret+=L'/'; + } + } + + if (lexical_forms.size()==0) { // This is an UNKNOWN WORD + ret +=L'*'; + ret.append(superficial_form); + } else if ((*lexical_forms.begin()).second[0]==L'*') { //This is an + //unknown word + //that has + //been guessed + ret += L'*'; + ret.append(superficial_form); + } else if (lexical_forms.size()>1) { //This is an ambiguous word + 
ret.append(lexical_forms[t]); + } else { + ret.append(lexical_forms[t]); + } + + /* Close the unit with + when plus_cut is set and with $ otherwise; nothing is appended while ret still equals ignored_string. */ if (ret != ignored_string) { + if (plus_cut) + ret+=L'+'; + else { + ret += L'$'; + } + } + + + //if ((superficial_form.length()>0)&&(superficial_form[superficial_form.length()-1]=='\'')) + // //If the superficial form ends in an apostrophe, put a blank after the string '/$' + // //or '/'. Otherwise two words with no blank between them will appear in the translation. + // ret+=" "; //Perhaps this is not the right place to do this; it would be better done in a module + // //that runs before the tagger or before anmor. + + return ret; +} + +/* Variant of get_lexical_form() that always writes the superficial form and lists every analysis: the lexical form stored for t comes first and, when there is more than one lexical form, the lexical forms of all other tags follow, each preceded by a slash. A word without lexical forms is written as the superficial form, a slash, an asterisk and the superficial form again. The opening and closing marks follow the same rules as in get_lexical_form(). */ wstring +TaggerWord::get_all_chosen_tag_first(TTag &t, int const TAG_kEOF) { + wstring ret=L""; + + if (show_ignored_string) + ret.append(ignored_string); + + if(t==TAG_kEOF) + return ret; + + if (!previous_plus_cut) + { + if(TaggerWord::generate_marks && isAmbiguous()) + { + ret.append(L"^="); + } + else + { + ret += L'^'; + } + } + + ret.append(superficial_form); + + if (lexical_forms.size()==0) { // This is an UNKNOWN WORD + ret+=L"/*"; + ret.append(superficial_form); + } else { + ret+=L"/"; + ret.append(lexical_forms[t]); + if (lexical_forms.size()>1) { + set::iterator it; + for (it=tags.begin(); it!=tags.end(); it++) { + if (*it != t) { + ret+=L"/"; + ret.append(lexical_forms[*it]); + } + } + } + } + + if (ret != ignored_string) { + if (plus_cut) + ret+=L"+"; + else { + ret+=L"$"; + } + } + + return ret; +} + +//OBSOLETE: same selection as get_lexical_form() but the ignored string is never written and an unknown word is opened with an asterisk followed by ^ +wstring +TaggerWord::get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF) { + wstring ret; + + if(t==TAG_kEOF) + return ret; + + if (lexical_forms.size()==0) { //This is an unknown word + ret.append(L"*^"); + ret.append(superficial_form); + } else if ((*lexical_forms.begin()).second[0]=='*') { //This is an unknown word that has been guessed + ret.append(L"*^"); + ret.append(superficial_form); + } else { + ret += L'^'; + ret.append(lexical_forms[t]); + } + + if (ret.length() != 0) { + if (plus_cut) + ret+=L'+'; + else { + ret 
+=L'$'; + } + } + + return ret; +} + +/* Appends s to the ignored string kept for this word. */ void +TaggerWord::add_ignored_string(wstring const &s) { + ignored_string.append(s); +} + +/* Sets the plus_cut flag; when it is set the output routines close this word with + instead of $. */ void +TaggerWord::set_plus_cut(const bool &c) { + plus_cut=c; +} + +/* Returns the plus_cut flag. */ bool +TaggerWord::get_plus_cut() { + return plus_cut; +} + +/* NOTE(review): in this copy of the patch the text between os and const &at) is truncated (the body of operator<< and the head of the function that assigns array_tags are missing); it is reproduced as found and must be restored from the repository. */ wostream& +operator<< (wostream& os, TaggerWord &w) { + os< const &at) +{ + array_tags = at; +} + +/* Debug dump to wcout: the superficial form between # marks, then one (tag lexical_form) pair per tag. */ void +TaggerWord::print() +{ + wcout << L"[#" << superficial_form << L"# "; + for(set::iterator it=tags.begin(), limit = tags.end(); it != limit; it++) + { + wcout << L"(" << *it << L" " << lexical_forms[*it] << L") "; + } + wcout << L"\b]\n"; +} + +/* Writes the word to output as ^superficial/lf1/lf2...$ plus a newline, skipping empty lexical forms; when the assembled text is empty an empty string is written. */ void +TaggerWord::outputOriginal(FILE *output) { + + wstring s=superficial_form; + + map::iterator it; + for(it=lexical_forms.begin(); it!=lexical_forms.end(); it++) { + if (it->second.length()>0) + { + s+=L'/'; + s.append(it->second); + } + } + + if (s.length()>0) + { + s=L"^"+s+L"$\n"; + } + + fputws_unlocked(s.c_str(), output); +} + +/* For an ambiguous word, removes every analysis whose lexical form matches the pattern given in the parameter tags, but always keeps at least one analysis: the scan stops as soon as a single lexical form is left. The member tag set is then replaced by the tags of the surviving analyses when their number changed. Note that the parameter is named tags and hides the member of the same name, which is why the member is reached through this-> below. */ void +TaggerWord::discardOnAmbiguity(wstring const &tags) +{ + if(isAmbiguous()) + { + map::iterator it = lexical_forms.begin(), + limit = lexical_forms.end(); + set newsettag; + while(it != limit) + { + if(match(it->second, tags)) + { + /* advance before erasing so that no element is skipped and the iterator stays valid */ + lexical_forms.erase(it++); + } + else + { + newsettag.insert(it->first); + ++it; + } + + if(lexical_forms.size() == 1) + { + newsettag.insert(lexical_forms.begin()->first); + break; + } + } + if(this->tags.size() != newsettag.size()) + { + this->tags = newsettag; + } + } +} Index: branches/apertium-tagger/apertium2/apertium/tmx_align_parameters.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_align_parameters.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_align_parameters.h (revision 69632) @@ -0,0 +1,51 @@ +/************************************************************************* +* * +* (C) Copyright 2004. 
Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGN_PARAMETERS_H +#define __TMXALIGNER_ALIGN_PARAMETERS_H +#include <string> + +/* Plain bundle of aligner options; tmx_builder.cc fills one and passes it to TMXAligner::alignerToolWithFilenames(). The header includes <string> itself because two members are std::string. */ class AlignParameters +{ +public: + enum RealignType { NoRealign, ModelOneRealign, FineTranslationRealign }; + + bool justSentenceIds; + bool justBisentences; + + bool cautiousMode; + RealignType realignType; + double qualityThreshold; + + double postprocessTrailQualityThreshold; + double postprocessTrailStartAndEndQualityThreshold; + double postprocessTrailByTopologyQualityThreshold; + + std::string handAlignFilename; + + bool utfCharCountingMode; + + std::string autoDictionaryDumpFilename; // Empty string means do not dump. + +/* Defaults: justSentenceIds on, justBisentences, cautiousMode and utfCharCountingMode off, NoRealign, qualityThreshold -100000, the three postprocess thresholds -1, both file names empty. */ AlignParameters() : justSentenceIds(true), + justBisentences(false), cautiousMode(false), + realignType(NoRealign), + qualityThreshold(-100000), + postprocessTrailQualityThreshold(-1), + postprocessTrailStartAndEndQualityThreshold(-1), + postprocessTrailByTopologyQualityThreshold(-1), + utfCharCountingMode(false) + {} + + +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tmx_alignment.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_alignment.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_alignment.cc (revision 69632) @@ -0,0 +1,614 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. 
* +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include + +#include // For SentenceList +#include // For FrequencyMap + +#include +#include +#include +#include + +// Copy-pasted helper. TODO: move it to a proper place. NOTE(review): the macro expands to a bare if statement, so an else written right after massert(...) would attach to it. +#define massert(e) if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } + +/* Writes a pair as first,second. */ std::ostream& operator<<( std::ostream& os, std::pair p ) +{ + os << p.first << "," << p.second; + return os; +} + +namespace TMXAligner +{ + + +// Attention, the two-sentence length is the first argument. Usually the Hungarian is, but not here. +// The bigger the better. closeness is never greater than bestScore: the ratio of the larger length+1 to the smaller length+1, minus 1, is multiplied by 0.3 and subtracted from bestScore (0.3), so equal lengths give exactly bestScore. +double closeness( double twoSentenceLength, double oneSentenceLength ) +{ + const double bestScore = 0.3; + const double quasiglobal_closenessMultiplier = 0.3; + + double ratio; + + if (twoSentenceLength>oneSentenceLength) + { + ratio = (twoSentenceLength+1)/(oneSentenceLength+1); + } + else + { + ratio = (oneSentenceLength+1)/(twoSentenceLength+1); + } + + ratio -= 1.0; + + // assert(ratio>=0); + return bestScore - quasiglobal_closenessMultiplier * ratio; +} + +/* Step codes held in the trellis matrix; dumpTrelliMatrix() prints them as HuEn, Hu, En, HuHuEn, HuEnEn and Dead respectively. */ const unsigned char Diag = 1; +const unsigned char HuSkip = 2; +const unsigned char EnSkip = 3; +const unsigned char HuHuEnSkip = 4; +const unsigned char HuEnEnSkip = 5; +const unsigned char Dead = 6; + +void buildDynProgMatrix( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength, + QuasiDiagonal& v, TrelliMatrix& trellis ) +{ + const int huBookSize = w.size(); + + + int huPos,enPos; + + // v[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals. + // The smaller value, the better similarity. (Unlike in the original similarity matrix w, where bigger is better.) 
+ + double infinity = 1e6; + + for ( huPos=0; huPos<=huBookSize; ++huPos ) + { + int rowStart = v.rowStart(huPos); + int rowEnd = v.rowEnd(huPos); + for ( enPos=rowStart; enPos0) + { + values[HuSkip] = v[huPos-1][enPos] - skipScore; + } + + if (enPos>0) + { + values[EnSkip] = v[huPos][enPos-1] - skipScore; + } + + if ((huPos>0) && (enPos>0)) + { + if (quasiglobal_lengthFitnessApplied) + { + lengthFitness = closeness(huLength[huPos-1], enLength[enPos-1]); + } + else + { + lengthFitness = 0; + } + + values[Diag] = v[huPos-1][enPos-1] - w[huPos-1][enPos-1] - lengthFitness ; + } + + const double dotLength = 2.0 ; + + if ((huPos>1) && (enPos>0)) + { + if (quasiglobal_lengthFitnessApplied) + { + lengthFitness = closeness(huLength[huPos-2]+huLength[huPos-1]+dotLength, enLength[enPos-1]); + } + else + { + lengthFitness = 0; + } + + } + + if ((huPos>0) && (enPos>1)) + { + if (quasiglobal_lengthFitnessApplied) + { + // Attention, the two-sentence length is the first argument. Usually the Hungarian is the first argument, but not here. + lengthFitness = closeness(enLength[enPos-2]+enLength[enPos-1]+dotLength, huLength[huPos-1]); + } + else + { + lengthFitness = 0; + } + + const double& a = w[huPos-1][enPos-1] ; + const double& b = w[huPos-1][enPos-2] ; + values[HuEnEnSkip] = v[huPos-1][enPos-2] - ( a *syt ) + ++syt; + else + { + ++inter; + ++sxt; + ++syt; + } + } + return inter; +} + + +// A bit of an abuse of the fact that Trail and BisentenceList are typedef'd to the same structure. +double scoreTrailOrBisentenceList( const Trail& trailAuto, const Trail& trailHand ) +{ + int score = countIntersectionOfTrails( trailAuto, trailHand ); + + std::cerr << trailAuto.size()-score << " misaligned out of " << trailHand.size() << " correct items, " + << trailAuto.size() << " bets." 
<< std::endl; + + std::cerr << "Precision: " << 1.0*score/trailAuto.size() + << ", Recall: " << 1.0*score/trailHand.size() << std::endl; + + double ratio = 1.0*(trailAuto.size()-score)/trailAuto.size(); + return ratio; +} + + +void trailToBisentenceList( const Trail& bestTrail, + BisentenceList& bisentenceList ) +{ + bisentenceList.clear(); + + int trailSize = bestTrail.size(); + + for ( int pos=0; pos=0) && (x=0) && (y +void dumpAlignMatrix( const QuasiDiagonal& alignMatrix ) +{ + int huPos,enPos; + + int huBookSize = alignMatrix.size(); + int enBookSize = alignMatrix.otherSize(); + + for ( huPos=0; huPos=end) ) + { + std::cout << "-1\t"; + continue; + } + + std::cout << alignMatrix[huPos][enPos] << "\t"; + } + std::cout << std::endl; + } +} + +void dumpAlignMatrix( const QuasiDiagonal& alignMatrix, bool graphical ) +{ + int huPos,enPos; + + int huBookSize = alignMatrix.size(); + int enBookSize = alignMatrix.otherSize(); + + for ( huPos=0; huPos=end) ) + { + if (graphical) + { + std::cout << " "; + } + else + { + std::cout << "-1\t"; + } + continue; + } + + if (graphical) + { + char c(' '); + switch (alignMatrix[huPos][enPos]) + { + case 0: c=' '; break; + case 1: c='.'; break; + case 2: c=':'; break; + case 3: c='|'; break; + case 4: c='+'; break; + default: c='X'; break; + } + std::cout << c << " "; + } + else + { + std::cout << alignMatrix[huPos][enPos] << "\t"; + } + } + std::cout << std::endl; + } +} + +void dumpTrelliMatrix( const TrelliMatrix& trellis ) +{ + std::map directions; + + directions[Diag] = "HuEn"; + directions[HuSkip] = "Hu"; + directions[EnSkip] = "En"; + directions[HuHuEnSkip] = "HuHuEn"; + directions[HuEnEnSkip] = "HuEnEn"; + directions[Dead] = "Dead"; + + int huPos,enPos; + + int huBookSize = trellis.size(); + int enBookSize = trellis.otherSize(); + + for ( huPos=0; huPos=end) ) + { + std::cout << "-1\t"; + continue; + } + + std::cout << directions[trellis[huPos][enPos]] << "\t"; + } + std::cout << std::endl; + } +} + +} // namespace 
TMXAligner Index: branches/apertium-tagger/apertium2/apertium/tmx_builder.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_builder.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_builder.cc (revision 69632) @@ -0,0 +1,1027 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "apertium_config.h" +#include + +#ifdef _MSC_VER +#include +#include +#endif + +using namespace Apertium; +using namespace std; + +TMXBuilder::TMXBuilder(wstring const &l1, wstring const &l2): +low_limit(0) +{ + lang1 = l1; + lang2 = l2; + + // default values of the parameters + + max_edit = 50; + diagonal_width = 10; + window_size = 100; + step = 75; + percent=0.85; + edit_distance_percent=0.30; + + freference = NULL; +} + +TMXBuilder::~TMXBuilder() +{ +} + +wstring +TMXBuilder::restOfBlank(FILE *input) +{ + wstring result = L"["; + + while(true) + { + wint_t val = fgetwc(input); + if(feof(input)) + { + return L""; + } + switch(val) + { + case L'\\': + result += L'\\'; + val = fgetwc(input); + if(feof(input)) + { + return L""; + } + result += static_cast(val); + break; + + case L']': + result += L']'; + return result; + + default: + result += static_cast(val); + break; + } + } + + return L""; +} + +wstring +TMXBuilder::nextBlank(FILE *input) +{ + wstring result = L""; + + while(true) + { + wint_t val = fgetwc(input); + if(feof(input)) + { + return L""; + } + switch(val) + { + case L'\\': + fgetwc(input); + break; + case L'[': + + result = restOfBlank(input); + return result; + } + } +} + +bool +TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy) +{ + wstring s1 = nextBlank(f1), s2 = nextBlank(f2); + if(!lazy) + { + while(!feof(f1) && !feof(f2)) + { + if(s1 != s2) + { + return false; + } + s1 = nextBlank(f1); + s2 = nextBlank(f2); + } + } + else + { + while(!feof(f1) && !feof(f2)) + { + if(s1.size() < s2.size()*(1-0.05) || s1.size() > s2.size()*(1+0.05)) + { + return false; + } + s1 = nextBlank(f1); + s2 = nextBlank(f2); + } + } + return true; +} + +bool +TMXBuilder::check(string const &file1, string const &file2, bool lazy) +{ + FILE *f1 = fopen(file1.c_str(), "rb"); + FILE *f2 = fopen(file2.c_str(), 
"rb"); + if(!f1 && !f2) + { + wcerr << L"Error: Cannot access files '" << UtfConverter::fromUtf8(file1); + wcerr << L"' and '" << UtfConverter::fromUtf8(file2) << "'" << endl; + return false; + } + else if(!f1) + { + wcerr << L"Error: Cannot access file '"; + wcerr << UtfConverter::fromUtf8(file2); + wcerr << "'" << endl; + fclose(f2); + return false; + } + else if(!f2) + { + wcerr << L"Error: Cannot access file '"; + wcerr << UtfConverter::fromUtf8(file2); + wcerr << "'" << endl; + fclose(f1); + return false; + } + + bool retval = compatible(f1, f2, lazy); + + fclose(f1); + fclose(f2); + return retval; +} + +wstring +TMXBuilder::nextTU(FILE *input) +{ + wstring current_tu = L""; + wstring tmp; + + while(true) + { + wint_t symbol = fgetwc_unlocked(input); + if(feof(input)) + { + if(current_tu == L"") + { + return L""; + } + else + { + return current_tu; + } + } + switch(symbol) + { + case L'\\': + symbol = fgetwc_unlocked(input); + if(feof(input)) + { + if(current_tu == L"") + { + return L""; + } + else + { + return current_tu; + } + } + // continued down + default: + current_tu += static_cast(symbol); + break; + + case L'[': + tmp = restOfBlank(input); + if(tmp.substr(0,2) == L"[ ") + { + current_tu.append(L" "); + } + current_tu.append(L""); + if(tmp.substr(tmp.size()-2, 2) == L" ]") + { + current_tu.append(L" "); + } + break; + + case L'.': + current_tu += L'.'; + symbol = fgetwc_unlocked(input); + + if(symbol != L'[' && !iswspace(symbol)) + { + if(!feof(input)) + { + ungetwc(symbol, input); + } + } + else + { + if(!feof(input)) + { + ungetwc(symbol, input); + } + + return current_tu; +/* size_t idx = current_tu.size()-1; + while(current_tu[idx] == L'.') + { + idx--; + } + return current_tu.substr(0, idx+1);*/ + } + break; + + case L'?': + case L'!': + current_tu += static_cast(symbol); + return current_tu; + } + } + + return current_tu; +} + +wstring +TMXBuilder::xmlize(wstring const &str) +{ + wstring result = L""; + + for(size_t i = 0, limit = str.size(); i < 
limit; i++) + { + /* NOTE(review): several wide string literals in this function are empty or unescaped in this copy of the patch (the 5-character marker that is compared is an empty literal, and the cases for the angle brackets and the ampersand append the same character unchanged); markup appears to have been lost when the patch was captured, so confirm every literal against the repository. */ switch(str[i]) + { + case L'<': + if(i + 5 <= limit && str.substr(i,5)==L"") + { + result.append(L""); + i += 4; + break; + } + else + { + result.append(L"<"); + } + break; + + case L'>': + result.append(L">"); + break; + + case L'&': + result.append(L"&"); + break; + + default: + result += str[i]; + break; + } + } + + // remove leading 5-character markers and any other leading character that is neither alphanumeric nor punctuation + + bool cambio = true; + while(cambio == true) + { + cambio = false; + while(result.size() >= 5 && result.substr(0,5) == L"") + { + result = result.substr(5); + cambio = true; + } + while(result.size() > 0 && !iswalnum(result[0]) && !iswpunct(result[0])) + { + result = result.substr(1); + cambio = true; + } + } + // remove trailing 5-character markers and any other trailing character that is neither alphanumeric nor punctuation + + cambio = true; + while(cambio == true) + { + cambio = false; + while(result.size() > 5 && result.substr(result.size()-5) == L"") + { + result = result.substr(0, result.size()-5); + cambio = true; + } + while(result.size() > 0 && !iswalnum(result[result.size()-1]) && !iswpunct(result[result.size()-1])) + { + result = result.substr(0, result.size()-1); + cambio = true; + } + } + + // remove trailing punctuation (only the characters accepted by isRemovablePunct) + + + for(unsigned int i = result.size()-1; result.size() > 0 && i > 0; i--) + { + if(!isRemovablePunct(result[i])) + { + result = result.substr(0, i+1); + break; + } + } + + while(result.size() > 0 && isRemovablePunct(result[result.size()-1])) + { + result = result.substr(0,result.size()-1); + } + + return result; +} + +/* Opens file1 and file2 for reading and outfile for writing (stdout when outfile is empty), exits with EXIT_FAILURE when one of them cannot be opened, and hands the three streams to generateTMX(). */ void +TMXBuilder::generate(string const &file1, string const &file2, + string const &outfile) +{ + FILE *output = stdout; + + if(outfile != "") + { + output = fopen(outfile.c_str(), "w"); + if(!output) + { + wcerr << L"Error: file '" << UtfConverter::fromUtf8(outfile); + wcerr << L"' cannot be opened for writing" << endl; + exit(EXIT_FAILURE); + } + } +#ifdef _MSC_VER + _setmode(_fileno(output), _O_U8TEXT); +#endif + + FILE *f1 = fopen(file1.c_str(), "r"); + if(!f1) + { + wcerr << L"Error: file '" << UtfConverter::fromUtf8(file1); + wcerr << L"' cannot be 
opened for reading" << endl; + exit(EXIT_FAILURE); + } + + FILE *f2 = fopen(file2.c_str(), "r"); + if(!f2) + { + wcerr << L"Error: file '" << UtfConverter::fromUtf8(file2); + wcerr << L"' cannot be opened for reading" << endl; + exit(EXIT_FAILURE); + } + +#ifdef _MSC_VER + _setmode(_fileno(f1), _O_U8TEXT); + _setmode(_fileno(f2), _O_U8TEXT); +#endif + + generateTMX(f1, f2, output); +} + +vector +TMXBuilder::reverseList(vector const &v) +{ + vector retval(v.size()); + + for(int j = v.size() - 1, i = 0; j >=0; j--, i++) + { + retval[i] = v[j]; + } + + return retval; +} + +vector +TMXBuilder::sentenceList(FILE *file) +{ + vector retval; + + while(true) + { + wstring f = nextTU(file); + if(feof(file)) + { + break; + } + retval.push_back(f); + } + + return retval; +} + +vector +TMXBuilder::extractFragment(vector const &text, unsigned int base, unsigned int width) +{ + vector result; + + for(unsigned int i = base; i < (base + width) && i < text.size(); i++) + { + result.push_back(text[i]); + } + + return result; +} + +int +TMXBuilder::argmin(int nw, int n, int w) +{ + if(nw <= n) + { + if(nw <= w) + { + return 1; + } + else + { + return 3; + } + } + else if(n <= w) + { + return 2; + } + else + { + return 3; + } +} + +void +TMXBuilder::generateTMX(FILE *f1, FILE *f2, FILE *output) +{ + fprintf(output, "\n"); + fprintf(output, "\n"); + fprintf(output, "
\n"); + fprintf(output, "
\n"); + fprintf(output, "\n"); + outputTU(f1, f2, output); + fprintf(output, "\n
\n"); + +} + +void +TMXBuilder::printTable(int *table, unsigned int nrows, unsigned int ncols) +{ + for(unsigned int i = 0; i < nrows; i++) + { + for(unsigned int j = 0; j < ncols; j++) + { + if(j != 0) + { + wcerr << L" "; + } + wcerr << setw(10) << table[i*ncols + j]; + } + wcerr << endl; + } +} + + +void +TMXBuilder::printTUCond(FILE *output, wstring const &tu1, wstring const &tu2, bool secure_zone) +{ + if(secure_zone && similar(tu1, tu2)) + { + printTU(output, tu1, tu2); + } +} + +void +TMXBuilder::splitAndMove(FILE *f1, string const &filename) +{ + FILE *stream = fopen(filename.c_str(), "w"); + vector fichero_por_cadenas = sentenceList(f1); + for(size_t i = 0; i < fichero_por_cadenas.size(); i++) + { + fputws_unlocked(fichero_por_cadenas[i].c_str(), stream); + fputws_unlocked(L"\n", stream); + } + fclose(stream); +} + +void +TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output) +{ + string left = tmpnam(NULL); + string right = tmpnam(NULL); + string out = tmpnam(NULL); + + splitAndMove(f1, left); + fclose(f1); + + splitAndMove(f2, right); + fclose(f2); + + TMXAligner::DictionaryItems dict; + AlignParameters ap; + + ap.justSentenceIds = false; + ap.utfCharCountingMode = false; + ap.realignType=AlignParameters::NoRealign; + + TMXAligner::alignerToolWithFilenames(dict, left, right, ap, out); + + FILE *stream = fopen(out.c_str(), "r"); + int conta = 0; + wstring partes[2]; + while(true) + { + wchar_t val = fgetwc(stream); + if(feof(stream)) + { + break; + } + + if(val == L'\t') + { + conta++; + } + else if(val == L'\n') + { + if(partes[0] != L"" && partes[1] != L"") + { + printTU(output, partes[0], partes[1]); + } + partes[0] = L""; + partes[1] = L""; + conta = 0; + } + if(conta < 2) + { + partes[conta] += val; + } + } + + unlink(left.c_str()); + unlink(right.c_str()); + unlink(out.c_str()); + + /* + + + int base_i = 0, base_j = 0; + + vector lista1 = reverseList(sentenceList(f1)), + lista2 = reverseList(sentenceList(f2)), lista3; + + if(freference != NULL) + { 
+ lista3 = reverseList(sentenceList(freference)); + } + + while(true) + { + vector l1 = extractFragment(lista1, base_i, window_size); + vector l2 = extractFragment(lista2, base_j, window_size) , l3; + + if(lista3.size() != 0) + { + l3 = extractFragment(lista3, base_j, window_size); + } + + int *table; + if(lista3.size() == 0) + { + table = levenshteinTable(l1, l2, diagonal_width, max_edit); + } + else + { + table = levenshteinTable(l1, l3, diagonal_width, max_edit); + } + + unsigned int const nrows = l1.size() + 1; + unsigned int const ncols = l2.size() + 1; + unsigned int i = nrows - 1; + unsigned int j = ncols - 1; + + + // printTable(table, nrows, ncols); + + bool newBase = false; + + + while(true) + { + int v = argmin(table[(i-1)*ncols + j-1], // i-1, j-1 + table[(i-1)*ncols + j], // i-j, j + table[i*ncols + j-1]); // i, j-1 + switch(v) + { + case 1: + i--; + j--; + + if(l3.size() == 0) + { + if((newBase || l1.size() < step) && similar(l1[i], l2[j])) + { + printTU(output, l1[i], l2[j]); + } + } + else + { + if((newBase || l1.size() < step) && similar(l1[i], l3[j])) + { + printTU(output, l1[i], l2[j]); + } + } + break; + + case 2: + i--; + if(i > 2 && argmin(table[(i-1)*ncols + j-1], + table[(i-1)*ncols + j], + table[i*ncols + j-1]) == 3 && + argmin(table[(i-1)*ncols + j-2], + table[(i-1)*ncols + j-1], + table[i*ncols + j-2]) != 1) + { + if(l3.size() == 0) + { + if((newBase || l1.size() < step) && similar(l1[i], l2[j])) + { + printTU(output, l1[i], l2[j]); + } + } + else + { + if((newBase || l1.size() < step) && similar(l1[i], l3[j])) + { + printTU(output, l1[i], l2[j]); + } + } + } + + // wcerr << L"[" << i << L" " << j << L"]" << endl; + break; + + case 3: + j--; + if(j > 2 && argmin(table[(i-1)*ncols + j-1], + table[(i-1)*ncols + j], + table[i*ncols + j-1]) == 1 && + argmin(table[(i-1)*ncols + j-2], + table[(i-1)*ncols + j-1], + table[i*ncols + j-2]) != 3) + { + if(l3.size() == 0) + { + if((newBase || l1.size() < step) && similar(l1[i], l2[j])) + { + 
printTU(output, l1[i], l2[j]); + } + } + else + { + if((newBase || l1.size() < step) && similar(l1[i], l3[j])) + { + printTU(output, l1[i], l2[j]); + } + } + } + + + break; + + default: + // error + break; + } + + if(i == step && !newBase) + { + base_i += i; + base_j += j; + newBase = true; + } + + if(i == 0 || j == 0) + { + break; + } + } + + delete[] table; + + if(l1.size() < window_size) + { + break; + } + }*/ +} + +int +TMXBuilder::weight(wstring const &s) +{ + return s.size()*2; // just the size of the string +} + +int * +TMXBuilder::levenshteinTable(vector &l1, vector &l2, + unsigned int diagonal_width, unsigned int max_edit) +{ + unsigned int const nrows = l1.size() + 1; + unsigned int const ncols = l2.size() + 1; + + int *table = new int[nrows * ncols]; + + table[0] = 0; + + for(unsigned int i = 1; i < nrows; i++) + { + table[i*ncols] = table[(i-1)*ncols] + weight(l1[i-1]); + } + + for(unsigned int j = 1; j < ncols; j++) + { + table[j] = table[j-1] + weight(l2[j-1]); + } + + for(unsigned int i = 1; i < nrows; i++) + { + for(unsigned int j = 1; j < ncols; j++) + { + int ed = 0; + + if(i > (j + diagonal_width)) + { + ed = table[i*ncols]+table[j]; + } + else if(j > (i + diagonal_width)) + { + ed = table[i*ncols]+table[j]; + } + else + { + ed = editDistance(l1[i-1], l2[j-1], max_edit); + } + + table[i*ncols+j] = min3(table[(i-1)*ncols + j-1] + ed, + table[(i-1)*ncols + j] + weight(l2[j-1]), + table[i*ncols + j-1] + weight(l1[i-1])); + } + } + + return table; +} + +wstring +TMXBuilder::filter(wstring const &tu) +{ + bool has_text = false; + unsigned int count_blank = 0; + + for(unsigned int i = 0, limit = tu.size(); i != limit; i++) + { + if(iswalpha(tu[i])) + { + has_text = true; + } + else if(has_text && iswspace(tu[i])) + { + count_blank++; + } + } + + if(!has_text || count_blank <= 2 || tu.size() == 0) + { + return L""; + } + + return xmlize(tu); +} + +void +TMXBuilder::printTU(FILE *output, wstring const &tu1, wstring const &tu2) const +{ + wstring 
tu1_filtered = filter(tu1); + wstring tu2_filtered = filter(tu2); + + if(tu1_filtered != L"" && tu2_filtered != L"") + { + + fprintf(output, "\n %s\n", + UtfConverter::toUtf8(lang1).c_str(), + UtfConverter::toUtf8(tu1_filtered).c_str()); + + fprintf(output, " %s\n\n", + UtfConverter::toUtf8(lang2).c_str(), + UtfConverter::toUtf8(tu2_filtered).c_str()); + } +} + +/* Returns the smallest of the three values. */ int +TMXBuilder::min3(int i1, int i2, int i3) +{ + if(i1 <= i2) + { + if(i1 <= i3) + { + return i1; + } + else + { + return i3; + } + } + else if(i2 <= i3) + { + return i2; + } + else + { + return i3; + } +} + +/* Returns the smaller of the two values. */ int +TMXBuilder::min2(int i1, int i2) +{ + if(i1 <= i2) + { + return i1; + } + else + { + return i2; + } +} + +/* Character-level edit distance between s1 and s2 in which a substitution costs 1 and an insertion or deletion costs 2. The table is capped at max_edit rows and max_edit columns, so only the first max_edit-1 characters of a longer string take part; the value returned is the last cell of the capped table. */ int +TMXBuilder::editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit) +{ + int const nrows = min2(s1.size() + 1, max_edit); + int const ncols = min2(s2.size() + 1, max_edit); + + /* a cap of zero leaves no cell to fill; report distance 0 instead of indexing an empty table */ + if(nrows <= 0 || ncols <= 0) + { + return 0; + } + + int *table = new int[nrows*ncols]; + + table[0] = 0; + + for(int i = 1; i < nrows; i++) + { + table[i*ncols] = i; + } + + /* the first row has ncols cells, so this loop is bounded by ncols */ + for(int j = 1; j < ncols; j++) + { + table[j] = j; + } + + for(int i = 1; i < nrows; i++) + { + for(int j = 1; j < ncols; j++) + { + int coste = 0; + if(s1[i-1] != s2[j-1]) + { + coste = 1; + } + + table[i*ncols+j] = min3(table[(i-1)*ncols+(j-1)]+coste, + table[(i-1)*ncols+j] + 2, + table[i*ncols+(j-1)] + 2); + } + } + int result = table[(nrows*ncols)-1]; + delete[] table; + return result; +} + +void +TMXBuilder::setMaxEdit(int me) +{ + max_edit = me; +} + +void +TMXBuilder::setDiagonalWidth(int dw) +{ + diagonal_width = dw; +} + +void +TMXBuilder::setWindowSize(int ws) +{ + window_size = ws; +} + +void +TMXBuilder::setStep(int s) +{ + step = s; +} + +void +TMXBuilder::setPercent(double p) +{ + percent = p; +} + +void +TMXBuilder::setLowLimit(int l) +{ + low_limit = l; +} + +void +TMXBuilder::setEditDistancePercent(double e) +{ + edit_distance_percent = e; +} + +/* Only the full stop counts as removable punctuation. */ bool +TMXBuilder::isRemovablePunct(wchar_t const &c) +{ + return c == L'.'; +} + +bool +TMXBuilder::similar(wstring 
const &s1, wstring const &s2) +{ + unsigned int l1 = s1.size(); + unsigned int l2 = s2.size(); + + if((l1 <= low_limit) && (l2 <= low_limit)) + { + return true; + } + else + { + int maxlength = max(l1, l2); + int minlength = min(l1, l2); + int ed = editDistance(s1, s2, maxlength); + + if(double(ed) < edit_distance_percent*double(maxlength)) + { + return double(minlength)/double(maxlength) > percent; + } + else + { + return false; + } + } +} + +void +TMXBuilder::setTranslation(string const &filename) +{ + freference = fopen(filename.c_str(), "r"); + if(!freference) + { + wcerr << L"Error: file '" << UtfConverter::fromUtf8(filename); + wcerr << L"' cannot be opened for reading" << endl; + freference = NULL; + } + +#ifdef _MSC_VER + if(freference != NULL) + { + _setmode(_fileno(freference), _O_U8TEXT); + } +#endif +} Index: branches/apertium-tagger/apertium2/apertium/tmx_translate.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_translate.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_translate.cc (revision 69632) @@ -0,0 +1,362 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include + +#include +#include +#include + +#include +#include + +namespace TMXAligner +{ + +void buildDumbDictionary( const DictionaryItems& dictionary, DumbDictionary& dumbDictionary ) +{ + dumbDictionary.clear(); + + for (size_t i=0; i1 are incomparable. + + const Phrase& oldTrans = ft->second; + + // Shorter phrases are better than longer ones. 
+ if (oldTrans.size()>en.size()) + { + overWrite = true; + } + + // More frequent words are better than less frequent ones. + if ( (oldTrans.size()==1) && (en.size()==1) ) + { + if ( enFreq[oldTrans[0]] < enFreq[en[0]] ) + { + overWrite = true; + } + } + } + else + { + overWrite = true; + } + + if (overWrite) + dumbDictionary[originalWord] = en ; + } + } +} + +void buildDumbDictionary( TMXAligner::DumbDictionary& dumbDictionary, + const std::string& dictionaryFilename, + const TMXAligner::SentenceList& enSentenceList + ) +{ + TMXAligner::DictionaryItems dictionary; + { + std::ifstream is( dictionaryFilename.c_str() ); + dictionary.read( is ); + std::cerr << dictionary.size() << " dictionary items read." << std::endl; + } + + if (!enSentenceList.empty()) + { + TMXAligner::FrequencyMap enFreq; + enFreq.build(enSentenceList); + TMXAligner::buildDumbDictionaryUsingFrequencies( dictionary, enFreq, dumbDictionary ); + } + else + { + TMXAligner::buildDumbDictionary( dictionary, dumbDictionary ); + } +} + +void trivialTranslateWord( + const DumbDictionary& dumbDictionary, + const Word& originalWord, + Phrase& words + ) +{ + words.clear(); + + DumbDictionary::const_iterator ft = dumbDictionary.find(originalWord); + if (ft!=dumbDictionary.end()) + { + words = ft->second; + } + else + { + bool leaveAsItis(false); + + // This worsens the score for the 1984 corpus, most possibly because of the false cognates a(a), is(is), van(van). 
+ bool alwaysLeaveAsItis = true; + if (alwaysLeaveAsItis) + { + leaveAsItis = true; + } + + if ( !leaveAsItis && (originalWord[0]>='A') && (originalWord[0]<='Z') ) + { + leaveAsItis = true; + } + + if (!leaveAsItis) + { + bool isNumber(true); + for ( size_t k=0; k'9') ) ) + { + isNumber = false; + break; + } + } + + if (isNumber) + { + leaveAsItis = true; + } + } + + if (leaveAsItis) + { + words.push_back(originalWord); + } + } +} + +void trivialTranslate( + const DumbDictionary& dumbDictionary, + const Sentence& sentence, + Sentence& translatedSentence + ) +{ + bool logging = false; + + std::ofstream* translateLogsPtr = 0; + if (logging) + { + translateLogsPtr = new std::ofstream( "/dev/null", std::ios::app ); + } + std::ostream& logs = translateLogsPtr ? *translateLogsPtr : std::cout ; + + translatedSentence.id = sentence.id; + Phrase& words = translatedSentence.words; + + if (logging && !translatedSentence.id.empty()) + logs << translatedSentence.id << "\t"; + + const Phrase& originalWords = sentence.words; + + for ( size_t j=0; j subsetLookup; + { + for ( size_t i=0; i results; + subsetLookup.lookup( sentenceList[i].words, results ); + + for ( std::set::const_iterator it=results.begin(); it!=results.end(); ++it ) + { + const Phrase& phrase = dictionary[*it-1].first; // !!! i-1 + + for ( size_t i=0; i. 
+ */ +#ifndef _TRANSFERINSTR_ +#define _TRANSFERINSTR_ + +#include + +using namespace std; + +enum TransferInstrType +{ + ti_clip_sl, + ti_clip_tl, + ti_var, + ti_lit_tag, + ti_lit, + ti_b, + ti_get_case_from, + ti_case_of_sl, + ti_case_of_tl, + ti_linkto_sl, + ti_linkto_tl, + ti_lu_count +}; + +class TransferInstr +{ +private: + TransferInstrType type; + string content; + int pos; + void *pointer; + bool condition; + + void copy(TransferInstr const &o); + void destroy(); +public: + TransferInstr() : + type(ti_clip_sl), + pos(0), + pointer(0), + condition(false) + {} + TransferInstr(TransferInstrType t, string const &c, int const p, + void *ptr=NULL, bool cond = true); + ~TransferInstr(); + TransferInstr(TransferInstr const &o); + TransferInstr & operator =(TransferInstr const &o); + + + TransferInstrType getType(); + string const & getContent(); + int getPos(); + void * getPointer(); + bool getCondition(); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transfer_mult.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_mult.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_mult.cc (revision 69632) @@ -0,0 +1,514 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +void +TransferMult::destroy() +{ + if(me) + { + delete me; + me = NULL; + } +} + +TransferMult::TransferMult() : +word(0), +blank(0), +output(0), +any_char(0), +any_tag(0), +nwords(0) +{ + me = NULL; + isRule = false; + defaultAttrs = lu; + numwords = 0; +} + +TransferMult::~TransferMult() +{ + destroy(); +} + +string +TransferMult::tolower(string const &str) const +{ + string result = str; + for(unsigned int i = 0, limit = str.size(); i != limit; i++) + { + result[i] = ::tolower(result[i]); + } + + return result; +} + +void +TransferMult::readData(FILE *in) +{ + alphabet.read(in); + any_char = alphabet(TRXReader::ANY_CHAR); + any_tag = alphabet(TRXReader::ANY_TAG); + + Transducer t; + t.read(in, alphabet.size()); + + map finals; + + // finals + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + int key = Compression::multibyte_read(in); + finals[key] = Compression::multibyte_read(in); + } + + me = new MatchExe(t, finals); + + // attr_items + bool recompile_attrs = Compression::string_read(in) != string(pcre_version()); + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + attr_items[cad_k].read(in); + wstring fallback = Compression::wstring_read(in); + if(recompile_attrs) { + attr_items[cad_k].compile(UtfConverter::toUtf8(fallback)); + } + } + + // variables + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + variables[cad_k] = UtfConverter::toUtf8(Compression::wstring_read(in)); + } + + // macros + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + macros[cad_k] = Compression::multibyte_read(in); + } + + // 
lists + for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++) + { + string const cad_k = UtfConverter::toUtf8(Compression::wstring_read(in)); + + for(int j = 0, limit2 = Compression::multibyte_read(in); j != limit2; j++) + { + wstring const cad_v = Compression::wstring_read(in); + lists[cad_k].insert(UtfConverter::toUtf8(cad_v)); + listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v))); + } + } +} + +void +TransferMult::readBil(string const &fstfile) +{ + FILE *in = fopen(fstfile.c_str(), "r"); + if(!in) + { + cerr << "Error: Could not open file '" << fstfile << "'." << endl; + exit(EXIT_FAILURE); + } + fstp.load(in); + fstp.initBiltrans(); + fclose(in); +} + +void +TransferMult::read(string const &datafile, string const &fstfile) +{ + // datafile + FILE *in = fopen(datafile.c_str(), "r"); + if(!in) + { + cerr << "Error: Could not open file '" << datafile << "'." << endl; + exit(EXIT_FAILURE); + } + readData(in); + fclose(in); + + readBil(fstfile); +} + +TransferToken & +TransferMult::readToken(FILE *in) +{ + if(!input_buffer.isEmpty()) + { + return input_buffer.next(); + } + + wstring content = L""; + while(true) + { + int val = fgetwc_unlocked(in); + if(feof(in)) + { + return input_buffer.add(TransferToken(content, tt_eof)); + } + if(val == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val == L'[') + { + content += L'['; + while(true) + { + int val2 = fgetwc_unlocked(in); + if(val2 == L'\\') + { + content += L'\\'; + content += wchar_t(fgetwc_unlocked(in)); + } + else if(val2 == L']') + { + content += L']'; + break; + } + else + { + content += wchar_t(val2); + } + } + } + else if(val == L'$') + { + return input_buffer.add(TransferToken(content, tt_word)); + } + else if(val == L'^') + { + return input_buffer.add(TransferToken(content, tt_blank)); + } + else + { + content += wchar_t(val); + } + } +} + +void +TransferMult::transfer(FILE *in, FILE *out) +{ + int last = 0; + + output = 
out; + ms.init(me->getInitial()); + + while(true) + { + if(ms.size() == 0) + { + if(isRule) + { + applyRule(); + isRule = false; + input_buffer.setPos(last); + } + else + { + if(tmpword.size() != 0) + { + pair tr = fstp.biltransWithQueue(*tmpword[0], false); + if(tr.first.size() != 0) + { + vector multiword = acceptions(tr.first); + if(multiword.size() > 1) + { + fputws_unlocked(L"[{]", output); + } + for(unsigned int i = 0, limit = multiword.size(); i != limit; i++) + { + if(i > 0) + { + fputws_unlocked(L"[|]", output); + } + fputwc_unlocked(L'^', output); + fputws_unlocked(multiword[i].c_str(), output); + fputwc_unlocked(L'$', output); + } + if(multiword.size() > 1) + { + fputws_unlocked(L".[][}]", output); + } + } + tmpword.clear(); + isRule = false; + input_buffer.setPos(last); + input_buffer.next(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + else if(tmpblank.size() != 0) + { + fputws_unlocked(tmpblank[0]->c_str(), output); + tmpblank.clear(); + last = input_buffer.getPos(); + ms.init(me->getInitial()); + } + } + } + int val = ms.classifyFinals(me->getFinals()); + if(val != -1) + { + isRule = true; + numwords = tmpword.size(); + last = input_buffer.getPos(); + } + + TransferToken ¤t = readToken(in); + + switch(current.getType()) + { + case tt_word: + applyWord(current.getContent()); + tmpword.push_back(¤t.getContent()); + break; + + case tt_blank: + ms.step(L' '); + tmpblank.push_back(¤t.getContent()); + break; + + case tt_eof: + if(tmpword.size() != 0) + { + tmpblank.push_back(¤t.getContent()); + ms.clear(); + } + else + { + fputws_unlocked(current.getContent().c_str(), output); + return; + } + break; + + default: + wcerr << L"Error: Unknown input token." 
<< endl;
+        return;
+    }
+  }
+}
+
+/**
+ * Tells whether str carries the default-translation mark " D<".
+ * @param str one alternative of a bilingual-dictionary result
+ * @return true when " D<" occurs anywhere in str
+ */
+bool
+TransferMult::isDefaultWord(wstring const &str)
+{
+  // find() yields npos when there is no match; returning the position
+  // itself would be true for "not found" and false for a match at 0.
+  return str.find(L" D<") != wstring::npos;
+}
+
+/**
+ * Splits a bilingual-dictionary result into its alternatives.
+ *
+ * A leading '@' is dropped, then str is cut at every '/' that is not
+ * preceded by a backslash. An alternative recognised by isDefaultWord()
+ * is swapped into position 0. If at least two alternatives contain "__",
+ * only those are returned.
+ *
+ * @param str result string, alternatives separated by '/'
+ * @return the list of alternatives (never empty)
+ */
+vector<wstring>
+TransferMult::acceptions(wstring str)
+{
+  vector<wstring> result;
+  int low = 0;
+
+  // removing '@'
+  if(!str.empty() && str[0] == L'@')
+  {
+    str = str.substr(1);
+  }
+
+  // "i < limit": the escape branch advances i itself, so with a trailing
+  // backslash an "i != limit" test would step over limit and never stop.
+  for(unsigned int i = 0, limit = str.size(); i < limit; i++)
+  {
+    if(str[i] == L'\\')
+    {
+      i++;
+    }
+    else if(str[i] == L'/')
+    {
+      wstring new_word = str.substr(low, i-low);
+
+      // NOTE(review): this test uses "> 1" while the one for the last
+      // alternative below uses "> 0" -- confirm which one is intended.
+      if(result.size() > 1 && isDefaultWord(new_word))
+      {
+        result.push_back(result[0]);
+        result[0] = new_word;
+      }
+      else
+      {
+        result.push_back(new_word);
+      }
+      low = i + 1;
+    }
+  }
+
+  wstring otherword = str.substr(low);
+  if(result.size() > 0 && isDefaultWord(otherword))
+  {
+    result.push_back(result[0]);
+    result[0] = otherword;
+  }
+  else
+  {
+    result.push_back(otherword);
+  }
+
+  // drop the alternatives that carry no sense mark ("__")
+  if(result.size() >= 2)
+  {
+    vector<wstring> result2;
+    for(unsigned int i = 0, limit = result.size(); i != limit; i++)
+    {
+      if(result[i].find(L"__") != wstring::npos)
+      {
+        result2.push_back(result[i]);
+      }
+    }
+    if(result2.size() >= 2)
+    {
+      return result2;
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Recursively appends to output_string every combination of the
+ * alternatives of the words in [itwords, limitwords).
+ *
+ * Each chosen alternative is added to acum as "^alternative$", followed
+ * by the next blank when more words remain. When the end of the word
+ * list is reached, acum is appended to output_string, preceded by "[|]"
+ * if multiple is set.
+ *
+ * @param itwords current word (its list of alternatives)
+ * @param itblanks blank that follows the current word
+ * @param limitwords end of the word list
+ * @param acum text accumulated so far for this combination
+ * @param multiple true once an alternative other than the first one has
+ *        been chosen on the way to this combination
+ */
+void
+TransferMult::writeMultiple(list<vector<wstring> >::iterator itwords,
+                            list<wstring>::iterator itblanks,
+                            list<vector<wstring> >::const_iterator limitwords,
+                            wstring acum , bool multiple)
+{
+  if(itwords == limitwords)
+  {
+    if(multiple)
+    {
+      output_string.append(L"[|]");
+    }
+    output_string.append(acum);
+  }
+  else
+  {
+    vector<wstring> &refword = *itwords;
+
+    itwords++;
+
+    if(itwords == limitwords)
+    {
+      // last word: no blank follows it
+      for(unsigned int i = 0, limit = refword.size(); i != limit; i++)
+      {
+        writeMultiple(itwords, itblanks, limitwords,
+                      acum + L"^" + refword[i] + L"$", multiple || (i > 0));
+      }
+    }
+    else
+    {
+      wstring &refblank = *itblanks;
+      itblanks++;
+
+      for(unsigned int i = 0, limit = refword.size(); i != limit; i++)
+      {
+        writeMultiple(itwords, itblanks, limitwords,
+                      acum + L"^" + refword[i] + L"$" + refblank,
+                      multiple || (i > 0));
+      }
+    }
+  }
+}
+
+void +TransferMult::applyRule() +{ + list blanks; + list > words; + + pair tr = fstp.biltransWithQueue(*tmpword[0], false); + words.push_back(acceptions(tr.first)); + + for(unsigned int i = 1; i != numwords; i++) + { + blanks.push_back(*tmpblank[i-1]); + pair tr = fstp.biltransWithQueue(*tmpword[i], false); + words.push_back(acceptions(tr.first)); + } + + output_string = L""; + writeMultiple(words.begin(), blanks.begin(), words.end()); + + if(output_string.find(L"[|]") != wstring::npos) + { + fputws_unlocked(L"[{]", output); + fputws_unlocked(output_string.c_str(), output); + fputws_unlocked(L".[][}]", output); + } + else + { + fputws_unlocked(output_string.c_str(), output); + } + + ms.init(me->getInitial()); + + tmpblank.clear(); + tmpword.clear(); + numwords = 0; +} + +void +TransferMult::applyWord(wstring const &word_str) +{ + ms.step(L'^'); + for(unsigned int i = 0, limit = word_str.size(); i < limit; i++) + { + switch(word_str[i]) + { + case L'\\': + i++; + ms.step(towlower(word_str[i]), any_char); + break; + + case L'<': + for(unsigned int j = i+1; j != limit; j++) + { + if(word_str[j] == L'>') + { + int symbol = alphabet(word_str.substr(i, j-i+1)); + if(symbol) + { + ms.step(symbol, any_tag); + } + else + { + ms.step(any_tag); + } + i = j; + break; + } + } + break; + + default: + ms.step(towlower(word_str[i]), any_char); + break; + } + } + ms.step(L'$'); +} Index: branches/apertium-tagger/apertium2/apertium/transfer_mult.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_mult.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_mult.h (revision 69632) @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or 
(at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _TRANSFER_MULT_ +#define _TRANSFER_MULT_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + +class TransferMult +{ +private: + + Alphabet alphabet; + MatchExe *me; + MatchState ms; + map attr_items; + map variables; + map macros; + map, Ltstr> lists; + map, Ltstr> listslow; + TransferWord **word; + string **blank; + Buffer input_buffer; + vector tmpword; + vector tmpblank; + wstring output_string; + + FSTProcessor fstp; + FILE *output; + int any_char; + int any_tag; + bool isRule; + unsigned int numwords; + + unsigned int nwords; + + enum OutputType{lu,chunk}; + + OutputType defaultAttrs; + + void destroy(); + void readData(FILE *input); + void readBil(string const &filename); + string caseOf(string const &str); + string copycase(string const &source_word, string const &target_word); + + bool beginsWith(string const &str1, string const &str2) const; + bool endsWith(string const &str1, string const &str2) const; + string tolower(string const &str) const; + string tags(string const &str) const; + wstring readWord(FILE *in); + wstring readBlank(FILE *in); + wstring readUntil(FILE *in, int const symbol) const; + void applyWord(wstring const &word_str); + void applyRule(); + TransferToken & readToken(FILE *in); + void writeMultiple(list >::iterator itwords, + list::iterator itblanks, + list >::const_iterator limitwords, + wstring acum = L"", bool multiple = false); + vector acceptions(wstring str); + bool isDefaultWord(wstring const &str); +public: + 
TransferMult(); + ~TransferMult(); + + void read(string const &datafile, string const &fstfile); + void transfer(FILE *in, FILE *out); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transfer_token.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_token.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_token.cc (revision 69632) @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include + +using namespace Apertium; + +void +TransferToken::copy(TransferToken const &o) +{ + type = o.type; + content = o.content; +} + +void +TransferToken::destroy() +{ +} + +TransferToken::TransferToken() : +type(tt_eof) +{ +} + +TransferToken::TransferToken(wstring const &content, + TransferTokenType type) +{ + this->content = content; + this->type = type; +} + +TransferToken::~TransferToken() +{ + destroy(); +} + +TransferToken::TransferToken(TransferToken const &o) +{ + copy(o); +} + +TransferToken & +TransferToken::operator =(TransferToken const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} + +TransferTokenType +TransferToken::getType() +{ + return type; +} + +wstring & +TransferToken::getContent() +{ + return content; +} + +void +TransferToken::setType(TransferTokenType type) +{ + this->type = type; +} + +void +TransferToken::setContent(wstring const &content) +{ + this->content = content; +} + Index: branches/apertium-tagger/apertium2/apertium/transfer_word.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_word.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_word.cc (revision 69632) @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include + +using namespace Apertium; +void +TransferWord::copy(TransferWord const &o) +{ + s_str = o.s_str; + t_str = o.t_str; + queue_length = o.queue_length; +} + +void +TransferWord::destroy() +{ +} + +TransferWord::TransferWord() : +queue_length(0) +{ +} + +TransferWord::TransferWord(string const &src, string const &tgt, int queue) +{ + init(src, tgt); + queue_length = queue; +} + +TransferWord::~TransferWord() +{ + destroy(); +} + +TransferWord::TransferWord(TransferWord const &o) +{ + copy(o); +} + +TransferWord & +TransferWord::operator =(TransferWord const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} + +void +TransferWord::init(string const &src, string const &tgt) +{ + s_str = src; + t_str = tgt; +} + +string +TransferWord::source(ApertiumRE const &part, bool with_queue) +{ + if(with_queue) + { + return part.match(s_str); + } + else + { + return part.match(s_str.substr(0, s_str.size() - queue_length)); + } +} + +string +TransferWord::target(ApertiumRE const &part, bool with_queue) +{ + if(with_queue) + { + return part.match(t_str); + } + else + { + return part.match(t_str.substr(0, t_str.size() - queue_length)); + } +} + +void +TransferWord::setSource(ApertiumRE const &part, string const &value, + bool with_queue) +{ + if(with_queue) + { + part.replace(s_str, value); + } + else + { + string mystring = s_str.substr(0, s_str.size() - queue_length); + part.replace(mystring, value); + s_str = mystring + s_str.substr(s_str.size() - queue_length); + } +} + +void +TransferWord::setTarget(ApertiumRE const &part, string const &value, + bool with_queue) +{ + if(with_queue) + { + part.replace(t_str, value); + } + else + { + string mystring = t_str.substr(0, t_str.size() - queue_length); + part.replace(mystring, value); + t_str = mystring + t_str.substr(t_str.size() - queue_length); + } +} Index: branches/apertium-tagger/apertium2/apertium/trx_reader.cc 
=================================================================== --- branches/apertium-tagger/apertium2/apertium/trx_reader.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/trx_reader.cc (revision 69632) @@ -0,0 +1,631 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include + +#include +#include +#include + +using namespace Apertium; +wstring const +TRXReader::ANY_TAG = L""; + +wstring const +TRXReader::ANY_CHAR = L""; + +void +TRXReader::destroy() +{ + xmlFreeTextReader(reader); +} + +TRXReader::TRXReader() : +reader(0), +type(0) +{ + td.getAlphabet().includeSymbol(ANY_TAG); + td.getAlphabet().includeSymbol(ANY_CHAR); +} + +TRXReader::~TRXReader() +{ + destroy(); +} + +void +TRXReader::step() +{ + int retval = xmlTextReaderRead(reader); + if(retval != 1) + { + parseError(L"unexpected EOF"); + } + name = XMLParseUtil::towstring(xmlTextReaderConstName(reader)); + type = xmlTextReaderNodeType(reader); +} + +wstring +TRXReader::attrib(wstring const &name) +{ + return XMLParseUtil::attrib(reader, name); +} + +void +TRXReader::parseError(wstring const &message) +{ + wcerr << L"Error: (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): " << message << L"." 
<< endl; + exit(EXIT_FAILURE); +} + +int +TRXReader::insertLemma(int const base, wstring const &lemma) +{ + int retval = base; + static int const any_char = td.getAlphabet()(ANY_CHAR); + if(lemma == L"") + { + retval = td.getTransducer().insertSingleTransduction(any_char, retval); + td.getTransducer().linkStates(retval, retval, any_char); + int another = td.getTransducer().insertSingleTransduction(L'\\', retval); + td.getTransducer().linkStates(another, retval, any_char); + } + else + { + for(unsigned int i = 0, limit = lemma.size(); i != limit; i++) + { + if(lemma[i] == L'\\') + { + retval = td.getTransducer().insertSingleTransduction(L'\\', retval); + i++; + retval = td.getTransducer().insertSingleTransduction(int(lemma[i]), + retval); + } + else if(lemma[i] == L'*') + { + retval = td.getTransducer().insertSingleTransduction(any_char, retval); + td.getTransducer().linkStates(retval, retval, any_char); + } + else + { + retval = td.getTransducer().insertSingleTransduction(int(lemma[i]), + retval); + } + } + } + + return retval; +} + +int +TRXReader::insertTags(int const base, wstring const &tags) +{ + int retval = base; + static int const any_tag = td.getAlphabet()(ANY_TAG); + if(tags.size() != 0) + { + for(unsigned int i = 0, limit = tags.size(); i < limit; i++) + { + if(tags[i] == L'*') + { + retval = td.getTransducer().insertSingleTransduction(any_tag, retval); + td.getTransducer().linkStates(retval, retval, any_tag); + i++; + } + else + { + wstring symbol = L"<"; + for(unsigned int j = i; j != limit; j++) + { + if(tags[j] == L'.') + { + symbol.append(tags.substr(i, j-i)); + i = j; + break; + } + } + + if(symbol == L"<") + { + symbol.append(tags.substr(i)); + i = limit; + } + symbol += L'>'; + td.getAlphabet().includeSymbol(symbol); + retval = td.getTransducer().insertSingleTransduction(td.getAlphabet()(symbol), retval); + } + } + } + else + { + return base; // new line + } + + return retval; +} + +void +TRXReader::read(string const &filename) +{ + reader = 
xmlReaderForFile(filename.c_str(), NULL, 0); + if(reader == NULL) + { + cerr << "Error: Cannot open '" << filename << "'." << endl; + exit(EXIT_FAILURE); + } + + procDefCats(); + step(); + while(name == L"#text" || name == L"#comment") + { + step(); + } + + if(name == L"section-def-attrs") + { + procDefAttrs(); + step(); + while(name == L"#text" || name == L"#comment") + { + step(); + } + } + + if(name == L"section-def-vars") + { + procDefVars(); + step(); + while(name == L"#text" || name == L"#comment") + { + step(); + } + } + + if(name == L"section-def-lists") + { + procDefLists(); + step(); + while(name == L"#text" || name == L"#comment") + { + step(); + } + } + + if(name == L"section-def-macros") + { + procDefMacros(); + step(); + while(name == L"#text" || name == L"#comment") + { + step(); + } + } + + if(name == L"section-rules") + { + procRules(); + step(); + while(name == L"#text" || name == L"#comment") + { + step(); + } + } +} + +void +TRXReader::procRules() +{ + int count = 0; + set alive_states; + + while(type != XML_READER_TYPE_END_ELEMENT || + name != L"section-rules") + { + step(); + if(name == L"rule") + { + if(type != XML_READER_TYPE_END_ELEMENT) + { + count++; + } + } + else if(name == L"pattern") + { + if(type != XML_READER_TYPE_END_ELEMENT) + { + alive_states.clear(); + alive_states.insert(td.getTransducer().getInitial()); + } + else + { + for(set::iterator it = alive_states.begin(), limit = alive_states.end(); + it != limit; it++) + { + td.getTransducer().setFinal(*it); + if(td.getFinals().find(*it) == td.getFinals().end()) + { + td.getFinals()[*it] = count; + } + else + { + wcerr << L"Warning (" << xmlTextReaderGetParserLineNumber(reader); + wcerr << L"): " + << L"Paths to rule " << count << " blocked by rule " << td.getFinals()[*it] + << L"." 
<< endl; + + } + } + } + } + else if(name == L"pattern-item") + { + if(type != XML_READER_TYPE_END_ELEMENT) + { + pair::iterator, + multimap::iterator> range; + + range = cat_items.equal_range(attrib(L"n")); + + if(range.first == range.second) + { + parseError(L"Undefined cat-item '" + attrib(L"n")); + } + +// new code + + set alive_states_new; + + for(; range.first != range.second; range.first++) + { + for(set::iterator it = alive_states.begin(), limit = alive_states.end(); + it != limit; it++) + { + // mark of begin of word + int tmp = td.getTransducer().insertSingleTransduction(L'^', *it); + if(*it != td.getTransducer().getInitial()) + { + // insert optional blank between two words + int alt = td.getTransducer().insertSingleTransduction(L' ', *it); + td.getTransducer().linkStates(alt, tmp, L'^'); + } + + // insert word + tmp = insertLemma(tmp, range.first->second.lemma); + tmp = insertTags(tmp, range.first->second.tags); + + // insert mark of end of word + tmp = td.getTransducer().insertSingleTransduction(L'$', tmp); + + // set as alive_state + alive_states_new.insert(tmp); + } + } + + // copy new alive states on alive_states set + alive_states = alive_states_new; + } + } + else if(name == L"let") + { + int count = 0; + int lineno = xmlTextReaderGetParserLineNumber(reader); + while(name != L"let" || type != XML_READER_TYPE_END_ELEMENT) + { + step(); + if(type == XML_ELEMENT_NODE) + { + count++; + + if(name == L"clip" && attrib(L"side") == L"sl") + { + wcerr << L"Warning (" << lineno; + wcerr << L"): assignment to 'sl' side has no effect." 
<< endl;
+          }
+        }
+
+        if(count != 0)
+        {
+          break;
+        }
+      }
+
+    }
+  }
+}
+
+/**
+ * Writes the transfer data held in `td` to a file opened in binary mode.
+ * Prints an error to cerr and exits with EXIT_FAILURE if the file cannot
+ * be opened for writing.
+ * @param filename path of the output file
+ */
+void
+TRXReader::write(string const &filename)
+{
+  FILE *out = fopen(filename.c_str(), "wb");
+  if(!out)
+  {
+    cerr << "Error: cannot open '" << filename;
+    cerr << "' for writing" << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  td.write(out);
+
+  fclose(out);
+}
+
+/**
+ * Processes nodes up to the closing "section-def-attrs" tag. Each
+ * "attr-item" is appended to the pattern of the "def-attr" being read;
+ * when a "def-attr" closes, its accumulated pattern is wrapped in
+ * parentheses. Any other element is reported through parseError().
+ */
+void
+TRXReader::procDefAttrs()
+{
+  wstring attrname;
+
+  while(type != XML_READER_TYPE_END_ELEMENT ||
+        name != L"section-def-attrs")
+  {
+    step();
+    if(name == L"attr-item")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        insertAttrItem(attrname, attrib(L"tags"));
+      }
+    }
+    else if(name == L"def-attr")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        attrname = attrib(L"n");
+      }
+      else
+      {
+        wstring all = td.getAttrItems()[attrname];
+        td.getAttrItems()[attrname] = L"(" + all + L")";
+        attrname = L"";
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"section-def-attrs")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"Unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * First advances to the opening root element ("transfer", "interchunk"
+ * or "postchunk"), rejecting any unexpected node on the way, and then
+ * processes nodes up to the closing "section-def-cats" tag, recording
+ * every "cat-item" under the name of its enclosing "def-cat".
+ */
+void
+TRXReader::procDefCats()
+{
+  while(type == XML_READER_TYPE_END_ELEMENT || !(name == L"transfer" || name == L"interchunk" || name == L"postchunk"))
+  {
+    step();
+    if(name != L"#text" && name != L"transfer" && name != L"interchunk" &&
+       name != L"postchunk" && name != L"section-def-cats" && name != L"#comment")
+    {
+      parseError(L"'<" + name + L">' tag unexpected");
+    }
+  }
+
+  wstring catname;
+
+  while(type != XML_READER_TYPE_END_ELEMENT ||
+        name != L"section-def-cats")
+  {
+    step();
+    if(name == L"cat-item")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        // An item either carries "tags" (with an optional "lemma") or
+        // only a "name", which is then stored in the lemma slot.
+        if(attrib(L"tags") != L"")
+        {
+          insertCatItem(catname, attrib(L"lemma"), attrib(L"tags"));
+        }
+        else
+        {
+          insertCatItem(catname, attrib(L"name"), L"");
+        }
+      }
+    }
+    else if(name == L"def-cat")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        catname = attrib(L"n");
+      }
+      else
+      {
+        catname = L"";
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"section-def-cats")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"Unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * Processes nodes up to the closing "section-def-vars" tag, creating one
+ * variable per "def-var" from its "n" (name) and "v" (initial value)
+ * attributes.
+ */
+void
+TRXReader::procDefVars()
+{
+  while(type != XML_READER_TYPE_END_ELEMENT ||
+        name != L"section-def-vars")
+  {
+    step();
+    if(name == L"def-var")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        createVar(attrib(L"n"), attrib(L"v"));
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"section-def-vars")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"Unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * Processes nodes up to the closing "section-def-lists" tag, adding the
+ * "v" attribute of every "list-item" to the list named by the enclosing
+ * "def-list".
+ */
+void
+TRXReader::procDefLists()
+{
+  wstring listname;
+
+  while(type != XML_READER_TYPE_END_ELEMENT ||
+        name != L"section-def-lists")
+  {
+    step();
+    if(name == L"list-item")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        insertListItem(listname, attrib(L"v"));
+      }
+    }
+    else if(name == L"def-list")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        listname = attrib(L"n");
+      }
+      else
+      {
+        listname = L"";
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"section-def-lists")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"Unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * Processes nodes up to the closing "section-def-macros" tag, numbering
+ * the "def-macro" elements consecutively from 0 in document order. All
+ * other nodes (including the macro bodies) are skipped here.
+ */
+void
+TRXReader::procDefMacros()
+{
+  int count = 0;
+  while(type != XML_READER_TYPE_END_ELEMENT ||
+        name != L"section-def-macros")
+  {
+    step();
+    if(name == L"def-macro")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        createMacro(attrib(L"n"), count++);
+      }
+    }
+  }
+}
+
+/**
+ * Registers a macro under the given number; a name that is already
+ * registered is reported through parseError().
+ * @param name macro name
+ * @param value number assigned to the macro
+ */
+void
+TRXReader::createMacro(wstring const &name, int const value)
+{
+  if(td.getMacros().find(name) != td.getMacros().end())
+  {
+    parseError(L"Macro '" + name + L"' defined at least twice");
+  }
+  td.getMacros()[name] = value;
+}
+
+/**
+ * Adds a value to the named list, creating the list on first use.
+ * @param name list name
+ * @param value item to add
+ */
+void
+TRXReader::insertListItem(wstring const &name, wstring const &value)
+{
+  td.getLists()[name].insert(value);
+}
+
+/**
+ * Creates the named variable, or overwrites it if it already exists.
+ * @param name variable name
+ * @param initial_value value stored for the variable
+ */
+void
+TRXReader::createVar(wstring const &name, wstring const &initial_value)
+{
+  td.getVariables()[name] = initial_value;
+}
+
+/**
+ * Records a lemma/tags pair under the given category name.
+ * @param name category name
+ * @param lemma lemma of the item
+ * @param tags tags of the item
+ */
+void
+TRXReader::insertCatItem(wstring const &name, wstring const &lemma,
+                         wstring const &tags)
+{
+  LemmaTags lt;
+  lt.lemma = lemma;
+  lt.tags = tags;
+  // make_pair deduces the pair type, so no explicit template arguments
+  // are required here.
+  cat_items.insert(make_pair(name, lt));
+}
+
+/**
+ * Appends one alternative to the pattern of the named attribute:
+ * alternatives are separated by '|', and a dotted tag sequence "a.b"
+ * becomes "<a><b>".
+ * @param name attribute name
+ * @param tags dot-separated tag sequence
+ */
+void
+TRXReader::insertAttrItem(wstring const &name, wstring const &tags)
+{
+  // Look the entry up once; operator[] creates it when it is absent.
+  wstring &item = td.getAttrItems()[name];
+
+  if(item.size() != 0)
+  {
+    item += L'|';
+  }
+
+  item += L'<';
+
+  for(unsigned int i = 0, limit = tags.size(); i != limit; i++)
+  {
+    if(tags[i] == L'.')
+    {
+      item.append(L"><");
+    }
+    else
+    {
+      item += tags[i];
+    }
+  }
+  item += L'>';
+
+}
Index: branches/apertium-tagger/apertium2/apertium/trx_reader.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/trx_reader.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/trx_reader.h (revision 69632)
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _TRXREADER_
+#define _TRXREADER_
+
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+
+#include
+#include
+#include
+
+using namespace std;
+
+/**
+ * Reader for transfer rule files: parses the XML sections into a
+ * TransferData object and writes the result to a file.
+ */
+class TRXReader
+{
+private:
+  /** Lemma and tags of one category item. */
+  struct LemmaTags
+  {
+    wstring lemma;
+    wstring tags;
+  };
+
+  /** XML text reader used for parsing. */
+  xmlTextReaderPtr reader;
+
+  /** Type of the current XML node. */
+  int type;
+  /** Name of the current XML node. */
+  wstring name;
+
+  // NOTE(review): the template arguments of this multimap were lost in
+  // extraction; it is keyed by category name and holds LemmaTags values.
+  multimap cat_items;
+  /** Data collected while reading. */
+  TransferData td;
+
+  /** @return the value of the named attribute of the current node */
+  wstring attrib(wstring const &name);
+
+  void parseError(wstring const &message);
+  void destroy();
+  void clearTagIndex();
+
+  /** Advances the reader to the next node. */
+  void step();
+  void procTransfer();
+  void procDefCats();
+  void procDefAttrs();
+  void procDefVars();
+  void procDefLists();
+  void procDefMacros();
+  void procRules();
+
+  void insertCatItem(wstring const &name, wstring const &lemma,
+                     wstring const &tags);
+  void insertAttrItem(wstring const &name, wstring const &tags);
+  void createVar(wstring const &name, wstring const &initial_value);
+  void insertListItem(wstring const &name, wstring const &value);
+  void createMacro(wstring const &name, int const val);
+
+  int insertLemma(int const base, wstring const &lemma);
+  int insertTags(int const base, wstring const &tags);
+
+public:
+  static wstring const ANY_TAG;
+  static wstring const ANY_CHAR;
+
+
+  TRXReader();
+  ~TRXReader();
+
+  /**
+   * Reads a transfer rule file.
+   * @param filename path of the file to read
+   */
+  void read(string const &filename);
+
+  /**
+   * Writes the data that was read to a file.
+   * @param filename path of the output file
+   */
+  void write(string const &filename);
+};
+
+#endif
Index: branches/apertium-tagger/apertium2/apertium/tsx_reader.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/tsx_reader.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/tsx_reader.cc (revision 69632)
@@ -0,0 +1,596 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+using namespace Apertium;
+/** Copy helper; intentionally empty (nothing is copied). */
+void
+TSXReader::copy(TSXReader const &o)
+{
+}
+
+/** Cleanup helper; intentionally empty. */
+void
+TSXReader::destroy()
+{
+}
+
+/**
+ * Creates a reader with no XML document attached and caches pointers to
+ * the containers owned by `tdata`, which the parsing methods fill in.
+ */
+TSXReader::TSXReader() :
+reader(0),
+type(0)
+{
+  open_class = &(tdata.getOpenClass());
+  forbid_rules = &(tdata.getForbidRules());
+  tag_index = &(tdata.getTagIndex());
+  array_tags = &(tdata.getArrayTags());
+  enforce_rules = &(tdata.getEnforceRules());
+  prefer_rules = &(tdata.getPreferRules());
+  plist = &(tdata.getPatternList());
+  constants = &(tdata.getConstants());
+}
+
+TSXReader::~TSXReader()
+{
+  destroy();
+}
+
+// NOTE(review): copy() is empty, so this constructor leaves every member
+// of the new object uninitialised.
+TSXReader::TSXReader(TSXReader const &o)
+{
+  copy(o);
+}
+
+
+/**
+ * Empties the tag index and the tag array and registers the built-in
+ * tags again, in a fixed order.
+ */
+void
+TSXReader::clearTagIndex()
+{
+  tag_index->clear();
+  array_tags->clear();
+  newTagIndex(L"LPAR");
+  newTagIndex(L"RPAR");
+  newTagIndex(L"LQUEST");
+  newTagIndex(L"CM");
+  newTagIndex(L"SENT");
+  newTagIndex(L"kEOF");
+  newTagIndex(L"kUNDEF");
+}
+
+/**
+ * Advances the XML reader to the next node and refreshes `name` and
+ * `type`. A read result other than 1 is reported as an unexpected EOF.
+ */
+void
+TSXReader::step()
+{
+  int retval = xmlTextReaderRead(reader);
+  if(retval != 1)
+  {
+    parseError(L"unexpected EOF");
+  }
+  name = XMLParseUtil::towstring(xmlTextReaderConstName(reader));
+  type = xmlTextReaderNodeType(reader);
+}
+
+TSXReader &
+TSXReader::operator =(TSXReader const &o)
+{
+  if(this != &o)
+  {
+    destroy();
+    copy(o);
+  }
+  return *this;
+}
+
+/**
+ * @param name attribute name
+ * @return the value of that attribute on the current node
+ */
+wstring
+TSXReader::attrib(wstring const &name)
+{
+  return XMLParseUtil::attrib(reader, name);
+}
+
+/**
+ * Prints the current parser line number and the message to wcerr and
+ * terminates the program with EXIT_FAILURE.
+ * @param message description of the error
+ */
+void
+TSXReader::parseError(wstring const &message)
+{
+  wcerr << L"Error: (" << xmlTextReaderGetParserLineNumber(reader);
+  wcerr << L"): " << message << L"." << endl;
+  exit(EXIT_FAILURE);
+}
+
+/**
+ * Registers a built-in tag: "TAG_" + tag is appended to the tag array
+ * and indexed under the same prefixed key. A tag that is already
+ * indexed is a parse error.
+ * @param tag tag name without the "TAG_" prefix
+ */
+void
+TSXReader::newTagIndex(wstring const &tag)
+{
+  if(tag_index->find(L"TAG_" + tag) != tag_index->end())
+  {
+    parseError(L"'" + tag + L"' already defined");
+  }
+
+  array_tags->push_back(L"TAG_" + tag);
+  (*tag_index)[L"TAG_" + tag] = array_tags->size() - 1;
+}
+
+/**
+ * Registers a tag defined in the file. Unlike newTagIndex(), the tag
+ * array stores the bare name; only the index key gets the "TAG_" prefix.
+ * @param tag tag name without the "TAG_" prefix
+ */
+void
+TSXReader::newDefTag(wstring const &tag)
+{
+  if(tag_index->find(L"TAG_" + tag) != tag_index->end())
+  {
+    parseError(L"'" + tag + L"' already defined");
+  }
+
+  array_tags->push_back(tag);
+  (*tag_index)[L"TAG_" + tag] = array_tags->size() - 1;
+}
+
+/**
+ * Registers a constant whose value is its position in the tag array and
+ * appends its name to that array.
+ * @param constant constant name
+ */
+void
+TSXReader::newConstant(wstring const &constant)
+{
+  constants->setConstant(constant, array_tags->size());
+  array_tags->push_back(constant);
+}
+
+/**
+ * Processes nodes up to the closing "discard-on-ambiguity" tag, adding
+ * each "discard" element's tags, rewritten from "a.b" to "<a><b>", to the
+ * discard list.
+ */
+void
+TSXReader::procDiscardOnAmbiguity()
+{
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"discard-on-ambiguity")
+  {
+    step();
+
+    if(name == L"discard")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        tdata.addDiscard(L"<" + StringUtils::substitute(attrib(L"tags"), L".", L"><") + L">");
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"discard-on-ambiguity")
+    {
+      if(type == XML_READER_TYPE_END_ELEMENT)
+      {
+        break;
+      }
+      else
+      {
+        parseError(L"Unexpected 'discard-on-ambiguity' open tag");
+      }
+    }
+    else
+    {
+      parseError(L"unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * Processes one "def-label" element: registers the label (adding it to
+ * the open classes unless closed="true") and inserts every "tags-item"
+ * into the pattern list under that label.
+ */
+void
+TSXReader::procDefLabel()
+{
+  wstring name_attr = attrib(L"name");
+  wstring closed_attr = attrib(L"closed");
+  newDefTag(name_attr);
+
+  if(closed_attr != L"true")
+  {
+    open_class->insert((*tag_index)[L"TAG_"+name_attr]);
+  }
+
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"def-label")
+  {
+    step();
+
+    if(name == L"tags-item")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        plist->insert((*tag_index)[L"TAG_"+name_attr], attrib(L"lemma"),
+                      attrib(L"tags"));
+      }
+    }
+    else if(name == L"def-label")
+    {
+      return;
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * Processes one "def-mult" element: registers the label like
+ * procDefLabel() and inserts each "sequence" of "label-item" and
+ * "tags-item" children into the pattern list between beginSequence()
+ * and endSequence().
+ */
+void
+TSXReader::procDefMult()
+{
+  wstring name_attr = attrib(L"name");
+  wstring closed_attr = attrib(L"closed");
+  newDefTag(name_attr);
+  if(closed_attr != L"true")
+  {
+    open_class->insert((*tag_index)[L"TAG_"+name_attr]);
+  }
+
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"def-mult")
+  {
+    step();
+    if(name == L"sequence")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        plist->beginSequence();
+        while(type != XML_READER_TYPE_END_ELEMENT || name != L"sequence")
+        {
+          step();
+          if(name == L"label-item")
+          {
+            if(type != XML_READER_TYPE_END_ELEMENT)
+            {
+              plist->insert((*tag_index)[L"TAG_"+name_attr],
+                            (*tag_index)[L"TAG_"+attrib(L"label")]);
+            }
+          }
+          else if(name == L"tags-item")
+          {
+            if(type != XML_READER_TYPE_END_ELEMENT)
+            {
+              plist->insert((*tag_index)[L"TAG_"+name_attr],
+                            attrib(L"lemma"), attrib(L"tags"));
+            }
+          }
+          else if(name == L"sequence")
+          {
+            break;
+          }
+          else if(name == L"#text")
+          {
+            // do nothing
+          }
+          else if(name == L"#comment")
+          {
+            // do nothing
+          }
+        }
+        plist->endSequence();
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"def-mult")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+/**
+ * Advances to the opening "tagset" element (only text, "tagger" and
+ * "tagset" nodes may precede it) and then processes every "def-label"
+ * and "def-mult" up to the closing "tagset" tag.
+ */
+void
+TSXReader::procTagset()
+{
+  while(type == XML_READER_TYPE_END_ELEMENT || name != L"tagset")
+  {
+    step();
+    if(name != L"#text" && name != L"tagger" && name != L"tagset")
+    {
+      parseError(L"'<" + name + L">' tag unexpected");
+    }
+  }
+
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"tagset")
+  {
+    step();
+    if(name == L"def-label")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        procDefLabel();
+      }
+    }
+    else if(name == L"def-mult")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        procDefMult();
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"tagset")
+    {
+      // do nothing
+    }
+    else
+    {
+      parseError(L"Unexpected '<" + name + L">' tag");
+    }
+  }
+}
+
+
+/**
+ * Reads the two "label-item" elements of a "label-sequence", skipping
+ * text and comment nodes, and stores them as one forbid rule.
+ */
+void
+TSXReader::procLabelSequence()
+{
+  TForbidRule forbid_rule;
+
+  step();
+  while(name == L"#text" || name == L"#comment")
+  {
+    step();
+  }
+  if(name != L"label-item")
+  {
+    parseError(L"<label-item> tag expected");
+  }
+
+  forbid_rule.tagi = (*tag_index)[L"TAG_" + attrib(L"label")];
+
+  step();
+  while(name == L"#text" || name == L"#comment")
+  {
+    step();
+  }
+  if(name != L"label-item")
+  {
+    parseError(L"<label-item> tag expected");
+  }
+  forbid_rule.tagj = (*tag_index)[L"TAG_" + attrib(L"label")];
+
+  forbid_rules->push_back(forbid_rule);
+}
+
+/**
+ * Processes nodes up to the closing "forbid" tag, turning every
+ * "label-sequence" into a forbid rule.
+ */
+void
+TSXReader::procForbid()
+{
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"forbid")
+  {
+    step();
+    if(name == L"label-sequence")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        procLabelSequence();
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"forbid")
+    {
+      if(type == XML_READER_TYPE_END_ELEMENT)
+      {
+        break;
+      }
+      else
+      {
+        parseError(L"Unexpected '" + name + L"' open tag");
+      }
+    }
+    else
+    {
+      parseError(L"Unexpected '" + name + L"' tag");
+    }
+  }
+}
+
+/**
+ * Processes nodes up to the closing "enforce-rules" tag. Each
+ * "enforce-after" element yields one rule: its "label" attribute is the
+ * first tag and the enclosed "label-item" elements are the tags allowed
+ * after it. The rule is stored when the element closes.
+ */
+void
+TSXReader::procEnforce()
+{
+  TEnforceAfterRule aux;
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"enforce-rules")
+  {
+    step();
+    if(name == L"enforce-after")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        aux.tagi = (*tag_index)[L"TAG_" + attrib(L"label")];
+      }
+      else
+      {
+        enforce_rules->push_back(aux);
+        aux.tagsj.clear();
+      }
+    }
+    else if(name == L"label-set")
+    {
+      // do nothing
+    }
+    else if(name == L"label-item")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        aux.tagsj.push_back((*tag_index)[L"TAG_" + attrib(L"label")]);
+      }
+    }
+    else if(name == L"#text")
+    {
+      // do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"enforce-rules")
+    {
+      if(type == XML_READER_TYPE_END_ELEMENT)
+      {
+        break;
+      }
+      else
+      {
+        parseError(L"Unexpected 'enforce-rules' open tag");
+      }
+    }
+    else
+    {
+      parseError(L"Unexpected '" + name + L"' tag");
+    }
+  }
+}
+
+/**
+ * Processes nodes up to the closing "preferences" tag, storing the tags
+ * of every "prefer" element rewritten from "a.b" to "<a><b>".
+ */
+void
+TSXReader::procPreferences()
+{
+  while(type != XML_READER_TYPE_END_ELEMENT || name != L"preferences")
+  {
+    step();
+    if(name == L"prefer")
+    {
+      if(type != XML_READER_TYPE_END_ELEMENT)
+      {
+        wstring const tags = L"<" + StringUtils::substitute(attrib(L"tags"), L".", L"><") + L">";
+        prefer_rules->push_back(tags);
+      }
+    }
+    else if(name == L"#text")
+    {
+      //do nothing
+    }
+    else if(name == L"#comment")
+    {
+      // do nothing
+    }
+    else if(name == L"preferences")
+    {
+      if(type == XML_READER_TYPE_END_ELEMENT)
+      {
+        break;
+      }
+      else
+      {
+        parseError(L"Unexpected 'preferences' open tag");
+      }
+    }
+    else
+    {
+      parseError(L"Unexpected '" + name + L"' tag");
+    }
+  }
+}
+
+/**
+ * Parses a tagger specification file. The tagset is mandatory; it may be
+ * followed, in this order, by "forbid", "enforce-rules", "preferences"
+ * and "discard-on-ambiguity" sections. Afterwards the built-in constants
+ * and punctuation patterns are registered and the pattern transducer is
+ * built. Exits with EXIT_FAILURE if the file cannot be opened.
+ * @param filename path of the file to read
+ */
+void
+TSXReader::read(string const &filename)
+{
+  reader = xmlReaderForFile(filename.c_str(), NULL, 0);
+  if(reader == NULL)
+  {
+    cerr << "Error: Cannot open '" << filename << "'." << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  open_class->clear();
+  forbid_rules->clear();
+  clearTagIndex();
+  enforce_rules->clear();
+
+  procTagset();
+
+  step();
+  while(name == L"#text" || name == L"#comment")
+  {
+    step();
+  }
+  if(name == L"forbid")
+  {
+    procForbid();
+    step();
+    while(name == L"#text" || name == L"#comment")
+    {
+      step();
+    }
+  }
+  if(name == L"enforce-rules")
+  {
+    procEnforce();
+    step();
+    while(name == L"#text" || name == L"#comment")
+    {
+      step();
+    }
+  }
+  if(name == L"preferences")
+  {
+    procPreferences();
+    step();
+    while(name == L"#text" || name == L"#comment")
+    {
+      step();
+    }
+  }
+  if(name == L"discard-on-ambiguity")
+  {
+    if(type != XML_READER_TYPE_END_ELEMENT)
+    {
+      procDiscardOnAmbiguity();
+    }
+  }
+
+  xmlFreeTextReader(reader);
+  xmlCleanupParser();
+
+  newConstant(L"kMOT");
+  newConstant(L"kDOLLAR");
+  newConstant(L"kBARRA");
+  newConstant(L"kMAS");
+  newConstant(L"kIGNORAR");
+  newConstant(L"kBEGIN");
+  newConstant(L"kUNKNOWN");
+
+  plist->insert((*tag_index)[L"TAG_LPAR"], L"", L"lpar");
+  plist->insert((*tag_index)[L"TAG_RPAR"], L"", L"rpar");
+  plist->insert((*tag_index)[L"TAG_LQUEST"], L"", L"lquest");
+  plist->insert((*tag_index)[L"TAG_CM"], L"", L"cm");
+  plist->insert((*tag_index)[L"TAG_SENT"], L"", L"sent");
+// plist->insert((*tag_index)[L"TAG_kMAS"], L"+", L"");
+  plist->buildTransducer();
+}
+
+/** @return the tagger data filled in by read() */
+TaggerData &
+TSXReader::getTaggerData()
+{
+  return tdata;
+}
Index: branches/apertium-tagger/apertium2/apertium/tsx_reader.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/tsx_reader.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/tsx_reader.h (revision 69632)
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either
version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _TSXREADER_
+#define _TSXREADER_
+
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+
+/**
+ * Reader for tagger specification files: parses the XML description
+ * into a TaggerData object. Objects of this class cannot be copied by
+ * client code (the copy operations are private).
+ */
+class TSXReader
+{
+private:
+  /** XML text reader used for parsing. */
+  xmlTextReaderPtr reader;
+  // NOTE(review): the template arguments of the containers below were
+  // lost in extraction. Each pointer refers to the container of the same
+  // name owned by `tdata`.
+  set *open_class;
+  vector *forbid_rules;
+  map *tag_index;
+  vector *array_tags;
+  vector *enforce_rules;
+  vector *prefer_rules;
+  PatternList *plist;
+  ConstantManager *constants;
+  /** Data collected while reading. */
+  TaggerData tdata;
+
+  /** Type of the current XML node. */
+  int type;
+  /** Name of the current XML node. */
+  wstring name;
+
+  /** @return the value of the named attribute of the current node */
+  wstring attrib(wstring const &name);
+
+  void parseError(wstring const &message);
+  void newTagIndex(wstring const &tag);
+  void newDefTag(wstring const &tag);
+  void newConstant(wstring const &constant);
+  void procDefLabel();
+  void procDefMult();
+  void procDiscardOnAmbiguity();
+  void procTagset();
+  void procForbid();
+  void procLabelSequence();
+  void procEnforce();
+  void procPreferences();
+  void destroy();
+  void clearTagIndex();
+
+  /** Advances the reader to the next node. */
+  void step();
+public:
+  TSXReader();
+  ~TSXReader();
+
+  /**
+   * Reads a tagger specification file.
+   * @param filename path of the file to read
+   */
+  void read(string const &filename);
+
+  /** @return the tagger data filled in by read() */
+  TaggerData & getTaggerData();
+
+private:
+  void copy(TSXReader const &o);
+  TSXReader(TSXReader const &o);
+  TSXReader & operator =(TSXReader const &o);
+};
+
+#endif
Index: branches/apertium-tagger/apertium2/apertium/apertium.1
===================================================================
--- branches/apertium-tagger/apertium2/apertium/apertium.1 (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/apertium.1 (revision 69632)
@@ -0,0 +1,115 @@
+.TH apertium 1 2006-03-08 "" ""
+.SH NAME
+apertium \- This application is part
of (
+.B apertium
+)
+.PP
+This tool is part of the apertium machine translation
+architecture: \fBhttp://apertium.sf.net\fR.
+.SH SYNOPSIS
+.B apertium
+[\-d datadir] [\-f format] [\-u] [\-a] {language-pair} [infile [outfile]]
+.SH DESCRIPTION
+.BR apertium
+is the application that most people will be using as it simplifies the
+use of apertium/lt-toolbox tools for machine translation
+purposes.
+.PP
+This tool tries to ease the use of \fIlt-toolbox\fR (which contains
+all the lexical processing modules and tools) and \fIapertium\fR
+(which contains the rest of the engine) by providing a single
+front-end to the end-user.
+.PP
+The different modules behind the apertium machine translation
+architecture are, in order:
+.RS
+\(bu \fIde-formatter:\fR Separates the text to be translated from the
+format information.
+.PP
+\(bu \fImorphological-analyser:\fR Tokenizes the text into surface forms.
+.PP
+\(bu \fIpart-of-speech tagger:\fR Chooses one surface form among
+homographs.
+.PP
+\(bu \fIlexical transfer module:\fR Reads each source-language lexical
+form and delivers a corresponding target-language lexical form.
+.PP
+\(bu \fIstructural transfer module:\fR Detects fixed-length patterns
+of lexical forms (chunks or phrases) needing special processing due to
+grammatical divergences between the two languages and performs the
+corresponding transformations.
+.PP
+\(bu \fImorphological generator:\fR Delivers a target-language surface
+form for each target-language lexical form, by suitably inflecting it.
+.PP
+\(bu \fIpost-generator:\fR Performs orthographical operations such as
+contractions and apostrophations.
+.PP
+\(bu \fIre-formatter:\fR Restores the format information encapsulated
+by the de-formatter into the translated text and removes the
+encapsulation sequences used to protect certain characters in the
+source text.
+.RE
+.SH OPTIONS
+.PP
+.B -d datadir
+The directory holding the linguistic data. By default it will use the
+expected installation path.
+.PP
+.B language-pair
+The language pair: LANG1-LANG2 (for instance \fIes-ca\fR or \fIca-es\fR).
+.PP
+.B -f format
+Specifies the format of the input and output files, which can have
+these values:
+.RS
+\(bu \fItxt\fR \fB(default value)\fR Input and output files are in
+text format.
+.PP
+\(bu \fIhtml\fR Input and output files are in "html" format. This
+"html" is the one accepted by the vast majority of web browsers.
+.PP
+\(bu \fIhtml-noent\fR Input and output files are in "html" format, but
+preserving native encoding characters rather than using HTML text
+entities.
+.PP
+\(bu \fIrtf\fR Input and output files are in "rtf" format. The
+accepted "rtf" is the one generated by \fBMicrosoft WordPad (C)\fR and
+\fBMicrosoft Office (C)\fR up to and including \fBOffice-97\fR.
+.RE
+.PP
+.B -u
+Disable marking of unknown words with the '*' character.
+.PP
+.B -a
+Enable marking of disambiguated words with the '=' character.
+.RS
+.SH FILES
+These are the two files that can be used with this command:
+.PP
+.B -m memory.tmx
+Use a translation memory to recycle translations.
+.PP
+.B -o direction
+Translation direction using the translation memory; by default 'direction' is used instead.
+.PP
+.B -l
+Lists the available translation directions and exits. A direction is
+typically LANG1-LANG2, but see modes.xml in the language data.
+.PP
+.B infile
+Input file (stdin by default).
+.PP
+.B outfile
+Output file (stdout by default).
+.PP
+.SH SEE ALSO
+.I lt-proc\fR(1),
+.I lt-comp\fR(1),
+.I lt-expand\fR(1),
+.I apertium-tagger\fR(1).
+.SH BUGS
+Lots of...lurking in the dark and waiting for you!
+.SH AUTHOR
+(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights
+reserved.
Index: branches/apertium-tagger/apertium2/apertium/apertium-multiple-translations.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/apertium-multiple-translations.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/apertium-multiple-translations.cc (revision 69632)
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef WIN32
+#if defined(__MINGW32__)
+#define __MSVCRT_VERSION__ 0x0800
+#endif
+#include
+#include
+#endif
+
+using namespace std;
+
+/**
+ * Prints the usage text to cerr and terminates with EXIT_FAILURE.
+ * @param progname program path as received in argv[0]
+ */
+void message(char *progname)
+{
+  cerr << "USAGE: " << basename(progname) << " preproc biltrans [input [output]]" << endl;
+  cerr << " preproc result of preprocess trules file" << endl;
+  cerr << " biltrans bilingual letter transducer file" << endl;
+  cerr << " input input file, standard input by default" << endl;
+  cerr << " output output file, standard output by default" << endl;
+  exit(EXIT_FAILURE);
+}
+
+/**
+ * Entry point: checks the arguments, opens the optional input and output
+ * files and runs the multiple-translation transfer over the input.
+ */
+int main(int argc, char *argv[])
+{
+  LtLocale::tryToSetLocale();
+
+  if(argc > 5 || argc <3)
+  {
+    message(argv[0]);
+  }
+
+  // Both data files (argv[1] and argv[2]) must exist.
+  for(unsigned int i = 1; i < 3; i++)
+  {
+    struct stat mybuf;
+    if(stat(argv[i], &mybuf) == -1)
+    {
+      cerr << "Error: can't stat file '";
+      cerr << argv[i] << "'." << endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  FILE *input = stdin, *output = stdout;
+  if(argc >= 4)
+  {
+    input = fopen(argv[3], "r");
+    if(!input)
+    {
+      cerr << "Error: can't open input file '" << argv[3] << "'." << endl;
+      exit(EXIT_FAILURE);
+    }
+    if(argc == 5)
+    {
+      output = fopen(argv[4], "w");
+      if(!output)
+      {
+        cerr << "Error: can't open output file '";
+        cerr << argv[4] << "'." << endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+#ifdef WIN32
+  _setmode(_fileno(input), _O_U8TEXT);
+  _setmode(_fileno(output), _O_U8TEXT);
+#endif
+
+  TransferMult t;
+  t.read(argv[1], argv[2]);
+
+  t.transfer(input, output);
+  return EXIT_SUCCESS;
+}
Index: branches/apertium-tagger/apertium2/apertium/apertium_filter_ambiguity.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/apertium_filter_ambiguity.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/apertium_filter_ambiguity.cc (revision 69632)
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _MSC_VER
+#include
+#include
+#endif
+
+using namespace Apertium;
+using namespace std;
+
+/**
+ * Opens a file, printing an error to cerr and exiting with EXIT_FAILURE
+ * on failure. Files opened for reading must already exist; files opened
+ * in any other mode are not checked beforehand, so that an output file
+ * can be created.
+ * @param filename path of the file
+ * @param mode fopen() mode string
+ * @return the opened stream (never NULL)
+ */
+FILE * open_file(char const *filename, char const *mode)
+{
+  FILE *retval;
+
+  // stat() fails for a file that does not exist yet, which is the normal
+  // case for a new output file, so only check files opened for reading.
+  if(mode[0] == 'r')
+  {
+    struct stat var;
+    if(stat(filename, &var))
+    {
+      cerr << "Can't stat '" << filename << "'" << endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  retval = fopen(filename, mode);
+
+  if(!retval)
+  {
+    cerr << "Can't open '" << filename << "'" << endl;
+    exit(EXIT_FAILURE);
+  }
+#ifdef _MSC_VER
+  _setmode(_fileno(retval), _O_U8TEXT);
+#endif
+
+  return retval;
+}
+
+/**
+ * Entry point: reads the tagger specification given as first argument
+ * and filters the ambiguity classes of the input stream into the output
+ * stream.
+ */
+int main(int argc, char *argv[])
+{
+  LtLocale::tryToSetLocale();
+
+  if(argc < 2 || argc > 4)
+  {
+    cerr << "USAGE: " << basename(argv[0]) << " tsx_file [input [output]]" << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  FILE *input = stdin, *output = stdout;
+  // The cases fall through on purpose: with four arguments both the
+  // output and the input file are opened.
+  switch(argc)
+  {
+    case 4:
+      output = open_file(argv[3], "w");
+      // no break
+    case 3:
+      input = open_file(argv[2], "r");
+      // no break
+    case 2:
+    default:
+      break;
+  }
+
+  TSXReader reader;
+  reader.read(argv[1]);
+
+  TaggerWord::setArrayTags(reader.getTaggerData().getArrayTags());
+
+  TaggerDataHMM tdhmm(reader.getTaggerData());
+  HMM hmm(&tdhmm);
+  hmm.filter_ambiguity_classes(input, output);
+
+  return EXIT_SUCCESS;
+}
Index: branches/apertium-tagger/apertium2/apertium/apertium_re.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/apertium_re.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/apertium_re.h (revision 69632)
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _APERTIUM_RE_
+#define _APERTIUM_RE_
+
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+
+using namespace std;
+
+/**
+ * Wrapper around a compiled PCRE regular expression that can be
+ * serialised to and from a stream.
+ */
+class ApertiumRE
+{
+private:
+  bool empty;
+  // NOTE(review): this is a raw pointer in a class that declares a
+  // destructor but no copy constructor or assignment operator; confirm
+  // how copies of ApertiumRE objects are meant to behave.
+  pcre *re;
+public:
+  ApertiumRE();
+  ~ApertiumRE();
+
+  /** Reads the expression from a stream. */
+  void read(FILE *);
+
+  /** Writes the expression to a stream. */
+  void write(FILE *) const;
+
+  /**
+   * Matches the expression against a string.
+   * @param str the string to search
+   * @return the result of the match as a string
+   */
+  string match(string const &str) const;
+
+  /**
+   * Replaces in place using the expression.
+   * @param str the string to modify
+   * @param value the replacement text
+   */
+  void replace(string &str, string const &value) const;
+
+  /**
+   * Compiles a pattern.
+   * @param str the pattern text
+   */
+  void compile(string const &str);
+};
+
+#endif
Index: branches/apertium-tagger/apertium2/apertium/collection.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/collection.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/collection.cc (revision 69632)
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+
+using namespace Apertium;
+
+/** @return the number of elements stored in the collection */
+int
+Collection::size()
+{
+  return element.size();
+}
+
+/**
+ * @param t the element to look for
+ * @return true if t is NOT in the collection
+ */
+bool
+Collection::has_not(const set<int> &t)
+{
+  return index.find(t) == index.end();
+}
+
+/**
+ * @param n position in the collection (not range-checked)
+ * @return the element stored at position n
+ */
+const set<int> &
+Collection::operator[](int n)
+{
+  return *element[n];
+}
+
+/**
+ * Looks an element up, adding it at the end when it is not present.
+ * @param t the element
+ * @return a reference to the position stored for t
+ */
+int &
+Collection::operator[](const set<int> &t)
+{
+  if(has_not(t))
+  {
+    return add(t);
+  }
+  return index[t];
+}
+
+/**
+ * Appends an element and records its position.
+ * @param t the element to add
+ * @return a reference to the position stored for t
+ */
+int &
+Collection::add(const set<int> &t)
+{
+  // Insert first and compute the position afterwards. A single
+  // "index[t] = index.size()-1" depends on the order in which the two
+  // operands are evaluated, which is unspecified before C++17 and
+  // right-to-left since then.
+  int &position = index[t];
+  position = index.size()-1;
+  element.push_back(&(index.find(t)->first));
+  return position;
+}
+
+/**
+ * Writes the number of elements, then for each element its size
+ * followed by its members.
+ * @param output the output stream
+ */
+void
+Collection::write(FILE *output)
+{
+  Compression::multibyte_write(element.size(), output);
+
+  for(int i = 0, limit = element.size(); i != limit; i++)
+  {
+    Compression::multibyte_write(element[i]->size(), output);
+    for(set<int>::const_iterator it = element[i]->begin(),
+          limit2 = element[i]->end(); it != limit2; it++)
+    {
+      Compression::multibyte_write(*it, output);
+    }
+  }
+}
+
+/**
+ * Reads elements in the layout produced by write() and adds them in the
+ * order read; elements already present are kept.
+ * @param input the input stream
+ */
+void
+Collection::read(FILE *input)
+{
+  int size = Compression::multibyte_read(input);
+
+  for(; size != 0; size--)
+  {
+    set<int> myset;
+    int set_size = Compression::multibyte_read(input);
+    for(; set_size != 0; set_size--)
+    {
+      myset.insert(Compression::multibyte_read(input));
+    }
+    add(myset);
+  }
+}
Index: branches/apertium-tagger/apertium2/apertium/collection.h
===================================================================
--- branches/apertium-tagger/apertium2/apertium/collection.h (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/collection.h (revision 69632)
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __COLLECTION_H
+#define __COLLECTION_H
+
+// NOTE(review): the operands of the #include directives below were lost
+// when this patch was extracted; restore them from the repository.
+#include
+#include
+#include
+#include
+
+using namespace std;
+
+/** Collection
+ *  Is an indexed set.
+ */
+class Collection {
+  /** Maps each element to its position. */
+  map <set <int>, int> index;
+  /** Elements by position; each pointer refers to a key of `index`. */
+  vector <const set<int> *> element;
+public:
+  /** Returns the collection's size.
+   */
+  int size (void);
+
+  /** Checks whether or not the collection has the element received as
+   *  a parameter.
+   *  @param t element
+   *  @return true if t is not in the collection
+   */
+  bool has_not (const set<int>& t);
+
+  /** @param n position in the collection
+   *  @return the element at the n-th position
+   */
+  const set<int>& operator[] (int n);
+
+  /** If the element received as a parameter does not appear in the
+   *  collection, it is added at the end.
+   *  @param t an element
+   *  @return the position in which t appears in the collection.
+   */
+  int& operator[] (const set<int>& t);
+
+  /** Adds an element to the collection
+   *  @param t the element to be added
+   *  @return the position recorded for t
+   */
+  int& add(const set<int>& t);
+
+  /**
+   * Write the collection contents to an output stream
+   * @param output the output stream
+   */
+  void write(FILE *output);
+
+  /**
+   * Reads the collection contents from an input stream
+   * @param input the input stream
+   */
+  void read(FILE *input);
+};
+
+
+#endif
Index: branches/apertium-tagger/apertium2/apertium/constant_manager.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/constant_manager.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/constant_manager.cc (revision 69632)
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ +#include +#include +#include + +using namespace Apertium; +void +ConstantManager::copy(ConstantManager const &o) +{ + constants = o.constants; +} + +void +ConstantManager::destroy() +{ +} + +ConstantManager::ConstantManager() +{ +} + +ConstantManager::~ConstantManager() +{ + destroy(); +} + +ConstantManager::ConstantManager(ConstantManager const &o) +{ + copy(o); +} + +ConstantManager & +ConstantManager::operator =(ConstantManager const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} +void +ConstantManager::setConstant(wstring const &constant, int const value) +{ + constants[constant] = value; +} + +int +ConstantManager::getConstant(wstring const &constant) +{ + return constants[constant]; +} + +void +ConstantManager::write(FILE *output) +{ + Compression::multibyte_write(constants.size(), output); + + for(map::const_iterator it = constants.begin(), limit = constants.end(); + it != limit; it++) + { + Compression::wstring_write(it->first, output); + Compression::multibyte_write(it->second, output); + } +} + +void +ConstantManager::read(FILE *input) +{ + constants.clear(); + int size = Compression::multibyte_read(input); + for(int i = 0; i != size; i++) + { + wstring mystr = Compression::wstring_read(input); + constants[mystr] = Compression::multibyte_read(input); + } +} Index: branches/apertium-tagger/apertium2/apertium/constant_manager.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/constant_manager.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/constant_manager.h (revision 69632) @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _CONSTANTMANAGER_ +#define _CONSTANTMANAGER_ + +#include +#include +#include + +using namespace std; + +class ConstantManager +{ +private: + map constants; + + void copy(ConstantManager const &o); + void destroy(); +public: + ConstantManager(); + ~ConstantManager(); + ConstantManager(ConstantManager const &o); + ConstantManager & operator =(ConstantManager const &o); + + void setConstant(wstring const &constant, int const value); + int getConstant(wstring const &constant); + void write(FILE *output); + void read(FILE *input); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/deformat.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/deformat.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/deformat.xsl (revision 69632) @@ -0,0 +1,870 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%{ + +#include <cstdlib> +#include <iostream> +#include <map> +#include <string> +#include <vector> + +extern "C" { +#if !defined(__STDC__) +# define __STDC__ 1 +#endif +#include <regex.h> +} + +#include <string> +#include <lttoolbox/lt_locale.h> +#include <lttoolbox/ltstr.h> +#ifndef GENFORMAT +#include "apertium_config.h" +#endif +#include <apertium/unlocked_cstdio.h> +#ifdef _WIN32 +#include <io.h> +#include <fcntl.h> +#endif + +using namespace std; + +wstring buffer; +string symbuf; +bool isDot, hasWrite_dot, hasWrite_white; 
+bool eosIncond; +bool noDot; +FILE *formatfile; +string last; +int current; +long int offset; + + +vector<long int> offsets; +vector<wstring> tags; +vector<int> orders; + +regex_t escape_chars; +regex_t names_regexp; + +void bufferAppend(wstring &buf, string const &str) +{ + symbuf.append(str); + + for(size_t i = 0, limit = symbuf.size(); i < limit;) + { + wchar_t symbol; + int gap = mbtowc(&symbol, symbuf.c_str() + i, MB_CUR_MAX); + if(gap == -1) + { + if(i + MB_CUR_MAX < limit) + { + buf += L'?'; + gap = 1; + } + else + { + symbuf = symbuf.substr(i); + return; + } + } + else + { + buf += symbol; + } + + i += gap; + } + + symbuf = ""; + return; +} + + +void init_escape() +{ + if(regcomp(&escape_chars, " + + + + ", REG_EXTENDED)) + { + cerr << "ERROR: Illegal regular expression for escape characters" << endl; + exit(EXIT_FAILURE); + } +} + +void init_tagNames() +{ + if(regcomp(&names_regexp, " + + + + ", REG_EXTENDED)) + { + cerr << "ERROR: Illegal regular expression for tag-names" << endl; + exit(EXIT_FAILURE); + } +} + +string backslash(string const &str) +{ + string new_str; + + for(unsigned int i = 0; i < str.size(); i++) + { + if(str[i] == '\\') + { + new_str += str[i]; + } + new_str += str[i]; + } + + return new_str; +} + + +wstring escape(string const &str) +{ + regmatch_t pmatch; + + char const *mystring = str.c_str(); + int base = 0; + wstring result; + + while(!regexec(&escape_chars, mystring + base, 1, &pmatch, 0)) + { + bufferAppend(result, str.substr(base, pmatch.rm_so)); + result += L'\\'; + wchar_t micaracter; + int pos = mbtowc(&micaracter, str.c_str() + base + pmatch.rm_so, MB_CUR_MAX); + if(pos == -1) + { + wcerr << L"Uno" << endl; + wcerr << L"Encoding error." 
<< endl; + exit(EXIT_FAILURE); + } + + result += micaracter; + base += pmatch.rm_eo; + } + + bufferAppend(result, str.substr(base)); + return result; +} + +wstring escape(wstring const &str) +{ + string dest; + + for(size_t i = 0, limit = str.size(); i < limit; i++) + { +#ifdef __GNUC__ + char symbol[MB_CUR_MAX+1]; +#else + std::string _symbol(MB_CUR_MAX+1, 0); + char *symbol = &_symbol[0]; +#endif + int pos = wctomb(symbol, str[i]); + if(pos == -1) + { + symbol[0]='?'; + pos = 1; + } + symbol[pos] = 0; + dest.append(symbol); + } + return escape(dest); +} + +string get_tagName(string tag){ + regmatch_t pmatch; + + char const *mystring = tag.c_str(); + string result; + if(!regexec(&names_regexp, mystring, 1, &pmatch, 0)) + { + result=tag.substr(pmatch.rm_so, pmatch.rm_eo - pmatch.rm_so); + return result; + } + + return ""; +} + + + + + + + + + + + + + + + + + + + + + + + + +int get_index(string end_tag){ + string new_end_tag; + size_t pos; + + for (int i=tags.size()-1; i >= 0; i--) { + // a wchar to char conversion can be up to 4 times larger + char *tmp = new char (sizeof(char)*((tags[i].size()+1) * 4)); + // Keep the existing memset. Better safe than sorry. + memset(tmp, '\0', tags[i].size() + 1); + + pos = wcstombs(tmp, tags[i].c_str(), tags[i].size()); + if (pos == (size_t)-1) + { + wcerr << L"Encoding error." 
<< endl; + exit(EXIT_FAILURE); + } + new_end_tag = tmp; + delete[] tmp; + + if (get_tagName(end_tag) == get_tagName(new_end_tag)) + return i; + } + + return -1; +} + +void print_emptyTags() { + wchar_t tag[250]; + + for (size_t i=0; i < tags.size(); i++) { + swprintf(tag, 250, L"<format-tag offset=\"%d\" order= \"%d\"><![CDATA[", offsets[i], orders[i]); + fputws(tag, formatfile); + fputws(tags[i].c_str(), formatfile); + fputwc(L']', formatfile); + swprintf(tag, 250, L"]></format-tag>\n"); + fputws(tag, formatfile); + } +} + + + + +void printBuffer(int ind=-1, string end_tag="") +{ + wchar_t tag[250]; + wstring etiketa; + wstring wend_tag; + size_t pos; + int num; + wchar_t result[end_tag.size() + 1]; + + // Convert end_tag to wstring + pos = mbstowcs(result, end_tag.c_str(), end_tag.size()); + if (pos == (size_t) -1) + { + wcerr << L"Encoding error." << endl; + exit(EXIT_FAILURE); + } + result[pos] = L'\0'; + wend_tag = result; + + if (ind != -1 && ind == tags.size()-1 && + offsets[ind] == offset && orders[ind] == current) + { + last = "buffer"; + buffer = tags.back() + buffer + wend_tag; + tags.pop_back(); + offsets.pop_back(); + orders.pop_back(); + } + else if (ind == -1 && wend_tag != L"") + { + last = "buffer"; + buffer = buffer + wend_tag; + } + else + { + if (hasWrite_dot && isDot) + { + swprintf(tag, 250, L"<empty-tag offset=\"%d\"/>\n", offset+1); + fputws(tag, formatfile); + + fputws(L" .\n", yyout); + offset += 2; + hasWrite_dot = false; + } + + isDot = false; + + if ((buffer.size() == 1 && buffer[0] != ' ') || buffer.size() > 1) + { + if (hasWrite_white) + { + fputws(L" ", yyout); + offset++; + hasWrite_white = false; + } + + current++; + + swprintf(tag, 250, L"<format-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); + fputws(tag, formatfile); + while ((pos = buffer.find(L"]]>")) != wstring::npos) + buffer.replace(pos, 3, L"\\]\\]\\>"); + fputws(buffer.c_str(), formatfile); + swprintf(tag, 250, L"]]></format-tag>\n"); + fputws(tag, 
formatfile); + } + else + { + fputws(buffer.c_str(), yyout); + offset += buffer.size(); + } + + + if (ind != -1) + { + if (hasWrite_white) + { + fputws(L" ", yyout); + offset++; + hasWrite_white = false; + } + + num = swprintf(tag, 250, L"<open-close-tag>\n"); + swprintf(tag + num, 250 - num, L"<open-tag offset=\"%d\" order=\"%d\"><![CDATA[", offsets[ind], orders[ind]); + fputws(tag, formatfile); + etiketa = tags[ind]; + while ((pos = etiketa.find(L"]]>")) != wstring::npos) + etiketa.replace(pos, 3, L"\\]\\]\\>"); + fputws(etiketa.c_str(), formatfile); + + current++; + + num = swprintf(tag, 250, L"]]></open-tag>\n"); + swprintf(tag + num, 250 - num, L"<close-tag offset=\"%d\" order=\"%d\"><![CDATA[", offset, current); + fputws(tag, formatfile); + while ((pos = wend_tag.find(L"]]>")) != wstring::npos) + wend_tag.replace(pos, 3, L"\\]\\]\\>"); + fputws(wend_tag.c_str(), formatfile); + num = swprintf(tag, 250, L"]]></close-tag>\n"); + swprintf(tag + num, 250 - num, L"</open-close-tag>\n"); + fputws(tag, formatfile); + + tags.erase(tags.begin() + ind); + offsets.erase(offsets.begin() + ind); + orders.erase(orders.begin() + ind); + } + + + last = "buffer"; + buffer = L""; + } + +} + + + +void preDot() +{ + if(eosIncond) + { + if(noDot) + { + fputws_unlocked(L"[]", yyout); + } + else + { + fputws_unlocked(L".[]", yyout); + } + } +} + +void printBuffer() +{ + if(isDot && !eosIncond) + { + if(noDot) + { + fputws_unlocked(L"[]", yyout); + } + else + { + fputws_unlocked(L".[]", yyout); + } + isDot = false; + } + if(buffer.size() > ) + { + string filename = tmpnam(NULL); + FILE *largeblock = fopen(filename.c_str(), "w"); + fputws_unlocked(buffer.c_str(), largeblock); + fclose(largeblock); + preDot(); + fputwc_unlocked(L'[', yyout); + fputwc_unlocked(L'@', yyout); + wchar_t cad[filename.size()]; + size_t pos = mbstowcs(cad, filename.c_str(), filename.size()); + if(pos == (size_t) -1) + { + wcerr << L"Tres" << endl; + + wcerr << L"Encoding error." 
<< endl; + exit(EXIT_FAILURE); + } + cad[pos] = 0; + fputws_unlocked(cad, yyout); + fputwc_unlocked(L']', yyout); + } + else if(buffer.size() > 1) + { + preDot(); + fputwc_unlocked(L'[', yyout); + wstring const tmp = escape(buffer); + if(tmp[0] == L'@') + { + fputwc_unlocked(L'\\', yyout); + } + fputws_unlocked(tmp.c_str(), yyout); + fputwc_unlocked(L']', yyout); + } + else if(buffer.size() == 1 && buffer[0] != L' ') + { + preDot(); + fputwc_unlocked(L'[', yyout); + wstring const tmp = escape(buffer); + if(tmp[0] == L'@') + { + fputwc_unlocked(L'\\', yyout); + } + fputws_unlocked(tmp.c_str(), yyout); + + fputwc_unlocked(L']', yyout); + } + else + { + fputws_unlocked(buffer.c_str(), yyout); + } + + buffer = L""; +} + + +%} + + + + + + + + +%option nounput +%option noyywrap +%option caseless +%option stack + +%% + + + + +<>{ + + + + + + + + + + + + + + + + + + + + { + last = "buffer"; + bufferAppend(buffer, yytext); + yy_pop_state(); +} + + \n|. { + last = "buffer"; + bufferAppend(buffer, yytext); +} + +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { + if (last == "open_tag") + bufferAppend(tags.back(), yytext); + else + bufferAppend(buffer, yytext); + +} + + { + printBuffer(); + fputwc_unlocked(L'\\', yyout); + offset++; + wchar_t symbol; + int pos = mbtowc(&symbol, yytext, MB_CUR_MAX); + if(pos == -1) + { + wcerr << L"Cuatro" << endl; + + wcerr << L"Encoding error." << endl; + exit(EXIT_FAILURE); + } + + fputwc_unlocked(symbol, yyout); + offset++; + hasWrite_dot = hasWrite_white = true; + +} + +. 
{ + printBuffer(); + symbuf += yytext; + wchar_t symbol; + int pos = mbtowc(&symbol, symbuf.c_str(), MB_CUR_MAX); + if(pos == -1) + { + if(symbuf.size() > (size_t) MB_CUR_MAX) + { + // unknown character + symbuf = ""; + fputwc_unlocked(L'?', yyout); + offset++; + hasWrite_dot = hasWrite_white = true; + } + } + else + { + symbuf = ""; + fputwc_unlocked(symbol, yyout); + offset++; + hasWrite_dot = hasWrite_white = true; + } +} + +<<EOF>> { + isDot = true; + + preDot(); + printBuffer(); + return 0; +} +%% + + + +void usage(string const &progname) +{ + + + cerr << "USAGE: " << progname << " format_file [input_file [output_file]" << ']' << endl; + + + cerr << "USAGE: " << progname << " [ -h | -i | -n ] [input_file [output_file]" << ']' << endl; + + + cerr << " format processor " << endl; + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + size_t base = 0; + eosIncond = false; + + if(argc >= 2) + { + if(!strcmp(argv[1],"-i")) + { + eosIncond = true; + base++; + } + else if(!strcmp(argv[1],"-n")) + { + noDot = true; + base++; + } + } + + + if(argc > 4 || argc < 2) + { + usage(argv[0]); + } + + switch(argc-base) + { + case 4: + yyout = fopen(argv[3+base], "w"); + if(!yyout) + { + usage(argv[0]); + } + case 3: + yyin = fopen(argv[2+base], "r"); + if(!yyin) + { + usage(argv[0]); + } + case 2: + formatfile = fopen(argv[1+base], "w"); + if(!formatfile) + { + usage(argv[0]); + } + break; + default: + break; + } + + + if((argc-base) > 4) + { + usage(argv[0]); + } + + switch(argc-base) + { + case 3: + yyout = fopen(argv[2+base], "w"); + if(!yyout) + { + usage(argv[0]); + } + case 2: + yyin = fopen(argv[1+base], "r"); + if(!yyin) + { + usage(argv[0]); + } + break; + default: + break; + } + + +#ifdef _WIN32 + _setmode(_fileno(yyin), _O_U8TEXT); + _setmode(_fileno(yyout), _O_U8TEXT); +#endif + // prevent warning message + yy_push_state(1); + yy_top_state(); + yy_pop_state(); + + + + + + + + + fputws(L"<?xml version=\"1.0\" 
encoding=\"UTF-8\" ?>\n", formatfile); + fputws(L"<format>\n", formatfile); + + + last = ""; + buffer = L""; + isDot = hasWrite_dot = hasWrite_white = false; + current=0; + offset = 0; + init_escape(); + init_tagNames(); + yylex(); + + + print_emptyTags(); + fputws(L"</format>", formatfile); + fclose(formatfile); + + fclose(yyin); + fclose(yyout); +} + + Index: branches/apertium-tagger/apertium2/apertium/endian_double_util.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/endian_double_util.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/endian_double_util.cc (revision 69632) @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include +#include +#include + +using namespace std; + +double +EndianDoubleUtil::read(FILE *input) +{ + double retval; +#ifdef WORDS_BIGENDIAN + fread_unlocked(&retval, sizeof(double), 1, input); +#else + char *s = reinterpret_cast(&retval); + + for(int i = sizeof(double)-1; i != -1; i--) + { + if(fread_unlocked(&(s[i]), 1, 1, input)==0) + { + return 0; + } + } +#endif + return retval; +} + +double +EndianDoubleUtil::read(istream &is) +{ + double retval; +#ifdef WORDS_BIGENDIAN + is.read((char *) &retval, sizeof(double)); +#else + char *s = reinterpret_cast(&retval); + + for(int i = sizeof(double)-1; i != -1; i--) + { + is.read(&(s[i]), sizeof(char)); + } +#endif + return retval; +} + +void +EndianDoubleUtil::write(FILE *output, double const &val) +{ + double val2 = val; +#ifdef WORDS_BIGENDIAN + fwrite(&val2, sizeof(double), 1, output); +#else + char *s = reinterpret_cast(&val2); + + for(int i = sizeof(double)-1; i != -1; i--) + { + fwrite(&(s[i]), 1, 1, output); + } +#endif +} + +void +EndianDoubleUtil::write(ostream &os, double const &val) +{ + double val2 = val; +#ifdef WORDS_BIGENDIAN + os.write(reinterpret_cast(&val2), sizeof(double)); +#else + char *s = reinterpret_cast(&val2); + + for(int i = sizeof(double)-1; i != -1; i--) + { + os.write(&(s[i]), sizeof(char)); + } +#endif +} Index: branches/apertium-tagger/apertium2/apertium/endian_double_util.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/endian_double_util.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/endian_double_util.h (revision 69632) @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _ENDIANDOUBLEUTIL_ +#define _ENDIANDOUBLEUTIL_ + +#include +#include +#include + +using namespace std; + +/** + * Generic class to process correctly endian-enabled I/O operations + */ +class EndianDoubleUtil +{ +public: + /** + * Read procedure. + * @param input the stream to read from. + * @returns the first element readed from the current position of the stream + */ + static double read(FILE *input); + + /** + * Read procedure, C++ I/O version. + * @param is the stream to read from. + * @returns the first element readed from the current position of the stream + */ + static double read(istream &is); + + /** + * Write procedure. + * @param output the stream to write to + * @param val the value of the generic object to write to the stream + */ + static void write(FILE *output, double const &val); + + /** + * Write procedure, C++ I/O version. 
+ * @param output the stream to write to + * @param val the value of the generic object to write to the stream + */ + static void write(ostream &os, double const &val); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/format.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/format.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/format.dtd (revision 69632) @@ -0,0 +1,141 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/interchunk.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk.dtd (revision 69632) @@ -0,0 +1,442 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/interchunk_word.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk_word.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/interchunk_word.cc (revision 69632) @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include + +using namespace Apertium; + +void +InterchunkWord::copy(InterchunkWord const &o) +{ + this->chunk = o.chunk; +} + +void +InterchunkWord::destroy() +{ +} + +InterchunkWord::InterchunkWord() +{ +} + +InterchunkWord::InterchunkWord(string const &chunk) +{ + init(chunk); +} + +InterchunkWord::~InterchunkWord() +{ + destroy(); +} + +InterchunkWord::InterchunkWord(InterchunkWord const &o) +{ + copy(o); +} + +InterchunkWord & +InterchunkWord::operator =(InterchunkWord const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} + +void +InterchunkWord::init(string const &chunk) +{ + for(size_t i = 0; i < chunk.size(); i++) + { + if(chunk[i] == '\\') + { + i++; + } + else if(chunk[i] == '{') + { + this->chunk = chunk.substr(0, i); + this->queue = chunk.substr(i); + return; + } + } + this->chunk = chunk; + this->queue = ""; +} + +string +InterchunkWord::chunkPart(ApertiumRE const &part) +{ + string result = part.match(chunk); + if(result.size() == 0) + { + result = part.match(queue); + if(result.size() != queue.size()) + { + return ""; + } + else + { + return result; + } + } + else if(result.size() == chunk.size()) + { + return part.match(chunk+queue); + } + else + { + return result; + } +} + +void +InterchunkWord::setChunkPart(ApertiumRE const &part, string const &value) +{ + part.replace(chunk, value); +} Index: branches/apertium-tagger/apertium2/apertium/interchunk_word.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/interchunk_word.h (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/interchunk_word.h (revision 69632)
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */
+
+#ifndef _INTERCHUNKWORD_
+#define _INTERCHUNKWORD_
+
+#include
+#include
+#include
+
+using namespace std;
+
+/**
+ * Word type for transfer modules: a chunk text split by init() into
+ * the part before its first unescaped '{' and the rest.
+ */
+class InterchunkWord
+{
+private:
+  /**
+   * Target language chunk name and tags: the text that precedes the
+   * first unescaped '{' of the string given to init()
+   */
+  string chunk;
+
+  /**
+   * Target language chunk content: the text from the first unescaped
+   * '{' onwards, or empty when there is no such brace
+   */
+  string queue;
+
+  /**
+   * Copy method, used by the copy constructor and by the assignment
+   * operator
+   * @param o the object to be copied
+   */
+  void copy(InterchunkWord const &o);
+
+  /**
+   * Destroy method, called by the destructor and before assignment
+   */
+  void destroy();
+
+public:
+  /**
+   * Non-parametric constructor
+   */
+  InterchunkWord();
+  /**
+   * Destructor
+   */
+  ~InterchunkWord();
+
+  /**
+   * Copy constructor
+   * @param o the object to be copied
+   */
+  InterchunkWord(InterchunkWord const &o);
+
+  /**
+   * Parametric constructor calling init()
+   * @param chunk the chunk
+   */
+  InterchunkWord(string const &chunk);
+
+  /**
+   * Assignment operator
+   * @param o the object to be assigned
+   * @return reference to left part of assignment
+   */
+  InterchunkWord & operator =(InterchunkWord const &o);
+
+  /**
+   * Sets a chunk, splitting it at its first '{' that is not preceded
+   * by a backslash
+   * @param chunk the chunk
+   */
+  void init(string const &chunk);
+
+  /**
+   * Gets a chunk part.  The expression is tried on the chunk name
+   * and tags first; when nothing matches there it is tried on the
+   * chunk content, and that match is returned only if it covers the
+   * whole content.  When the match covers the whole name-and-tags
+   * part, the match against name, tags and content together is
+   * returned instead.
+   * @param part regular expression to match
+   * @returns a copy of the part of string matched, empty if none
+   */
+  string chunkPart(ApertiumRE const &part);
+
+  /**
+   * Sets a value for a chunk part; only the chunk name and tags are
+   * modified, not the chunk content
+   * @param part regular expression to match
+   * @param value the new value for the given part
+   */
+  void setChunkPart(ApertiumRE const &part, string const &value);
+
+};
+
+#endif
Index: branches/apertium-tagger/apertium2/apertium/latex_accentsmap.cc
===================================================================
--- branches/apertium-tagger/apertium2/apertium/latex_accentsmap.cc (nonexistent)
+++ branches/apertium-tagger/apertium2/apertium/latex_accentsmap.cc (revision 69632)
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ */ + +#include + +using namespace std; + + +AccentsMap::AccentsMap(bool char2latex) { + if(char2latex) + init_camap(); + else + init_acmap(); +} + +AccentsMap::~AccentsMap(){ +} + +void AccentsMap::init_acmap() { + init_camap(); + for (acmap::iterator i = map.begin(); + i != map.end(); + ++i) + { + map[i->second] = i->first; + } +} + +void AccentsMap::init_camap() { + + map[L"Ă "] = L"`a"; // Grave accent + map[L"Ăš"] = L"`e"; + map[L"ĂŹ"] = L"`\\i"; + map[L"ĂČ"] = L"`o"; + map[L"Ăč"] = L"`u"; + map[L"ỳ"] = L"`y"; + map[L"À"] = L"`A"; + map[L"È"] = L"`E"; + map[L"Ì"] = L"`I"; + map[L"Ò"] = L"`O"; + map[L"Ù"] = L"`U"; + map[L"á»Č"] = L"`Y"; + map[L"ĂĄ"] = L"'a"; // Acute accent + map[L"Ă©"] = L"'e"; + map[L"Ă­"] = L"'\\i"; + map[L"Ăł"] = L"'o"; + map[L"Ăș"] = L"'u"; + map[L"Ăœ"] = L"'y"; + map[L"Á"] = L"'A"; + map[L"É"] = L"'E"; + map[L"Í"] = L"'I"; + map[L"Ó"] = L"'O"; + map[L"Ú"] = L"'U"; + map[L"Ý"] = L"'Y"; + map[L"Ăą"] = L"^a"; // Circumflex + map[L"ĂȘ"] = L"^e"; + map[L"Ăź"] = L"^\\i"; + map[L"ĂŽ"] = L"^o"; + map[L"Ă»"] = L"^u"; + map[L"Ć·"] = L"^y"; + map[L"Â"] = L"^A"; + map[L"Ê"] = L"^E"; + map[L"Î"] = L"^I"; + map[L"Ô"] = L"^O"; + map[L"Û"] = L"^U"; + map[L"ƶ"] = L"^Y"; + map[L"Ă€"] = L"\"a"; // Umlaut or dieresis + map[L"Ă«"] = L"\"e"; + map[L"ĂŻ"] = L"\"\\i"; + map[L"ö"] = L"\"o"; + map[L"ĂŒ"] = L"\"u"; + map[L"Ăż"] = L"\"y"; + map[L"Ä"] = L"\"A"; + map[L"Ë"] = L"\"E"; + map[L"Ï"] = L"\"I"; + map[L"Ö"] = L"\"O"; + map[L"Ü"] = L"\"U"; + map[L"Ćž"] = L"\"Y"; + + map[L"ñ"] = L"~n"; + map[L"Ñ"] = L"~N"; + + map[L"ç"] = L"cc"; // Cedilla + map[L"Ç"] = L"cC"; + + +} + +wstring AccentsMap::get(wstring input){ + it = map.find(input); + if(it == map.end()) + return L""; + else + return (*it).second; +} + +//Optionally: +void AccentsMap::init_locale(){ + char *locale = setlocale(LC_ALL, ""); + std::locale lollocale(locale); + wcout.imbue(lollocale); +} + + + +/*latexAccents = [ + map[L"Ă "] = L"\\`a"; # Grave accent + map[L"Ăš"] = L"\\`e"; + map[L"ĂŹ"] = 
L"\\`\\i"; + map[L"ĂČ"] = L"\\`o"; + map[L"Ăč"] = L"\\`u"; + map[L"ỳ"] = L"\\`y"; + map[L"À"] = L"\\`A"; + map[L"È"] = L"\\`E"; + map[L"Ì"] = L"\\`\\I"; + map[L"Ò"] = L"\\`O"; + map[L"Ù"] = L"\\`U"; + map[L"á»Č"] = L"\\`Y"; + map[L"ĂĄ"] = L"\\'a"; # Acute accent + map[L"Ă©"] = L"\\'e"; + map[L"Ă­"] = L"\\'\\i"; + map[L"Ăł"] = L"\\'o"; + map[L"Ăș"] = L"\\'u"; + map[L"Ăœ"] = L"\\'y"; + map[L"Á"] = L"\\'A"; + map[L"É"] = L"\\'E"; + map[L"Í"] = L"\\'\\I"; + map[L"Ó"] = L"\\'O"; + map[L"Ú"] = L"\\'U"; + map[L"Ý"] = L"\\'Y"; + map[L"Ăą"] = L"\\^a"; # Circumflex + map[L"ĂȘ"] = L"\\^e"; + map[L"Ăź"] = L"\\^\\i"; + map[L"ĂŽ"] = L"\\^o"; + map[L"Ă»"] = L"\\^u"; + map[L"Ć·"] = L"\\^y"; + map[L"Â"] = L"\\^A"; + map[L"Ê"] = L"\\^E"; + map[L"Î"] = L"\\^\\I"; + map[L"Ô"] = L"\\^O"; + map[L"Û"] = L"\\^U"; + map[L"ƶ"] = L"\\^Y"; + map[L"Ă€"] = L"\\\"a"; # Umlaut or dieresis + map[L"Ă«"] = L"\\\"e"; + map[L"ĂŻ"] = L"\\\"\\i"; + map[L"ö"] = L"\\\"o"; + map[L"ĂŒ"] = L"\\\"u"; + map[L"Ăż"] = L"\\\"y"; + map[L"Ä"] = L"\\\"A"; + map[L"Ë"] = L"\\\"E"; + map[L"Ï"] = L"\\\"\\I"; + map[L"Ö"] = L"\\\"O"; + map[L"Ü"] = L"\\\"U"; + map[L"Ćž"] = L"\\\"Y"; + map[L"ç"] = L"\\c{c}"; # Cedilla + map[L"Ç"] = L"\\c{C}"; + map[L"Ɠ"] = L"{\\oe}"; # Ligatures + map[L"ƒ"] = L"{\\OE}"; + map[L"ĂŠ"] = L"{\\ae}"; + map[L"Æ"] = L"{\\AE}"; + map[L"Ă„"] = L"{\\aa}"; + map[L"Å"] = L"{\\AA}"; + map[L"–"] = L"--"; # Dashes + map[L"—"] = L"---"; + map[L"Ăž"] = L"{\\o}"; # Misc latin-1 letters + map[L"Ø"] = L"{\\O}"; + map[L"ß"] = L"{\\ss}"; + map[L"ÂĄ"] = L"{!`}"; + map[L"Âż"] = L"{?`}"; + map[L"\\"] = L"\\\\"; # Characters that should be quoted + map[L"~"] = L"\\~"; + map[L"&"] = L"\\&"; + map[L"$"] = L"\\$"; + map[L"{"] = L"\\{"; + map[L"}"] = L"\\}"; + map[L"%"] = L"\\%"; + map[L"#"] = L"\\#"; + map[L"_"] = L"\\_"; + map[L"≄"] = L"$\\ge$"; # Math operators + map[L"≀"] = L"$\\le$"; + map[L"≠"] = L"$\\neq$"; + map[L"©"] = L"\copyright"; # Misc + map[L"ı"] = L"{\\i}"; + map[L"”"] = L"$\\mu$"; + map[L"°"] = 
L"$\\deg$"; + map[L"‘"] = L"`"; #Quotes + map[L"’"] = L"'"; + map[L"“"] = L"``"; + map[L"”"] = L"''"; + map[L"‚"] = L","; + map[L"„"] = L",,"; +]*/ + + + + + + Index: branches/apertium-tagger/apertium2/apertium/latex_accentsmap.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/latex_accentsmap.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/latex_accentsmap.h (revision 69632) @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +/*struct Ltstr // Already in lttoolbox/ltstr.h +{ + bool operator()(wstring const &s1, wstring const &s2) const + { + return wcscmp(s1.c_str(), s2.c_str()) < 0; + } +}; +*/ + +class AccentsMap { + typedef std::map acmap; + private: + acmap map; // Accent to character + acmap::iterator it; // Iterator for searching + + void init_acmap(); + void init_camap(); + public: + AccentsMap(bool char2accent); // the direction + ~AccentsMap(); + + // Optionally + void init_locale(); + + // The getter for both directions depending on init. 
+ wstring get(wstring input); +}; + Index: branches/apertium-tagger/apertium2/apertium/lexchoice.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/lexchoice.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lexchoice.xsl (revision 69632) @@ -0,0 +1,172 @@ + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + +

+ + + + + + __ + + + + + + + + + + + __ + + + + + + + + +

+
+ + + + +

+ + + + + + __ + + + + + + + + + + + __ + + + + + + + + +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ +
+ + + Index: branches/apertium-tagger/apertium2/apertium/lexchoicebil.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/lexchoicebil.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lexchoicebil.xsl (revision 69632) @@ -0,0 +1,169 @@ + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + +

+ + + + + __ + + + + + + + + + + + __ + + + + + + + + +

+
+ + + +

+ + + + + __ + + + + + + + + + + + __ + + + + + + + + +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ +
+ + + Index: branches/apertium-tagger/apertium2/apertium/lextor.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/lextor.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lextor.h (revision 69632) @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef __LEXTOR_H +#define __LEXTOR_H + +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +/** Class LexTor (Lexical Selector class) + */ + +class LexTor { +private: + LexTorData *lextor_data; + + //For use when tl information is used to perform lexical selection + LexTorData *tlmodel; + FSTProcessor *fstpbil; + + int estimate_winner_lch(deque& window, int word_index, double weigth_exponent); + int estimate_winner_lch_voting(deque& window, int word_index, double weigth_exponent); + int estimate_winner_lch_cosine(deque& window, int word_index, double weigth_exponent); + int estimate_winner_lch_mostprob(deque& window, int word_index, double weigth_exponent); + int estimate_winner_lch_votingtl(deque& window, int word_index, double weigth_exponent); + + double cosine(map& vcontext, const wstring& reduced_lexchoice); +public: + + static bool debug; + static double angleth; + + LexTor(); + + LexTor(const LexTor& lt); + + ~LexTor(); + + void set_lextor_data(LexTorData* ltd); + + //Used to set the tlmodel to be used when tl information is used to + //perform lexical selection + void set_tlmodel(LexTorData* tlm); + void set_bildic(FSTProcessor *fstp); + + void trainwrd(wistream& wis, int left, int right, double weigth_exponent=0); + + void trainlch(wistream& wis, int left, int right, LexTorData& wordmodel, + FSTProcessor& dic, FSTProcessor& bildic, double weigth_exponent=0); + + void lexical_selector(wistream& wis, FSTProcessor &fstp, int left, int right, + double weigth_exponent=0, LexTorEval* lteval=NULL); + + /** NOTE on the weigth_exponent parameter: This parameter is used to + change the influence of surrounding words on the decision to + take on an ambiguous word (word with more than one lexical + choice). 
For example, if a decision is being taken on word w_i, + the weight of the surrounding words is: + Score(w_i-2) = count(w_i-2)/pow(2,weigth_exponent), + Score(w_i-1) = count(w_i-1)/pow(1,weigth_exponent), + Score(w_i+1) = count(w_i+1)/pow(1,weigth_exponent), + Score(w_i+2) = count(w_i+2)/pow(2,weigth_exponent). + */ +}; + +class PairStringCountComparer { +public: + bool operator()(const pair& e1, const pair& e2) const { + //True if e1>e2 + + if (e1.second > e2.second) + return true; + else if (e1.second == e2.second) + return (e1.first>e2.first); + else + return false; + } +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/lextor_data.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lextor_data.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lextor_data.cc (revision 69632) @@ -0,0 +1,527 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +using namespace Apertium; +LexTorData::LexTorData() { + n_stopwords=0; + n_words=0; + n_words_per_set=0; + n_set=0; + + index2word.push_back(NULLWORD); + word2index[NULLWORD]=0; + n_words++; +} + +LexTorData::LexTorData(const LexTorData& ltd) { + n_stopwords=ltd.n_stopwords; + n_words=ltd.n_words; + n_words_per_set=ltd.n_words_per_set; + n_set=ltd.n_set; + + word2index=ltd.word2index; + index2word=ltd.index2word; + + lexchoice_set=ltd.lexchoice_set; + lexchoice_sum=ltd.lexchoice_sum; + //lexchoice_prob=ltd.lexchoice_prob; + + stopwords=ltd.stopwords; + words=ltd.words; + lexical_choices=ltd.lexical_choices; + reduced_lexical_choices=ltd.reduced_lexical_choices; +} + +LexTorData::~LexTorData() { +} + +COUNT_DATA_TYPE +LexTorData::vote_from_word(const wstring& lexical_choice, const wstring& word) { + WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)]; + WORD_DATA_TYPE ind_word=word2index[StringUtils::tolower(word)]; + + //To avoid creating a null entry in lexchoice_set[lexical_choice] + if (lexchoice_set[ind_lexchoice].find(ind_word)==lexchoice_set[ind_lexchoice].end()) + return 0; + else + return lexchoice_set[ind_lexchoice][ind_word]; +} + +//double +//LexTorData::get_lexchoice_prob(const string& lexical_choice) { +// return lexchoice_prob[word2index[lexical_choice]]; +//} + + +void +LexTorData::set_wordcount(const wstring& word, COUNT_DATA_TYPE c) { + WORD_DATA_TYPE ind_word=word2index[StringUtils::tolower(word)]; + wordcount[ind_word]=c; +} + +COUNT_DATA_TYPE +LexTorData::get_wordcount(const wstring& word) { + WORD_DATA_TYPE ind_word=word2index[StringUtils::tolower(word)]; + + if (wordcount.find(ind_word)==wordcount.end()) + return 0; + else + return wordcount[ind_word]; +} + +COUNT_DATA_TYPE +LexTorData::get_lexchoice_sum(const wstring& lexical_choice) { + return lexchoice_sum[word2index[StringUtils::tolower(lexical_choice)]]; +} + +void 
+LexTorData::set_lexchoice_sum(const wstring& lexical_choice, COUNT_DATA_TYPE sum) { + lexchoice_sum[word2index[StringUtils::tolower(lexical_choice)]]=sum; +} + +bool +LexTorData::is_stopword(const wstring& word) { + return (stopwords.find(StringUtils::tolower(word))!=stopwords.end()); +} + +void +LexTorData::read(FILE *is) { + //cerr<<"LexTorData::read------------------------------------\n"; + n_stopwords=(WORD_DATA_TYPE)Compression::multibyte_read(is); + n_words=(WORD_DATA_TYPE)Compression::multibyte_read(is); + n_words_per_set=(WORD_DATA_TYPE)Compression::multibyte_read(is); + n_set=(WORD_DATA_TYPE)Compression::multibyte_read(is); + + //cerr<::iterator it; + for (it=stopwords.begin(); it!=stopwords.end(); it++) { + Compression::wstring_write(*it, os); + } + + //Write the list of words + //cerr<<"list of words----------------------------------------\n"; + for(unsigned int i=1; i >::iterator it_lch_set; + map::iterator it_w_lch_set; + //map::iterator it_lch_prob; + + for(it_lch_set=lexchoice_set.begin(); it_lch_set!=lexchoice_set.end(); it_lch_set++) { + WORD_DATA_TYPE lexchoice=it_lch_set->first; + COUNT_DATA_TYPE sum=lexchoice_sum[lexchoice]; + //double prob=lexchoice_prob[lexchoice]; + + //cerr<<"lexchoice: "< (&prob), sizeof(double)); + EndianDoubleUtil::write(os, sum); + + int nwritten_words=0; + for(it_w_lch_set=it_lch_set->second.begin(); + it_w_lch_set!=it_lch_set->second.end(); + it_w_lch_set++) { + WORD_DATA_TYPE word=it_w_lch_set->first; + COUNT_DATA_TYPE count=it_w_lch_set->second; + //cerr<<" word: "<::iterator sit; + for(sit=words.begin(); sit!=words.end(); sit++) { + WORD_DATA_TYPE word=word2index[*sit]; + Compression::multibyte_write(word, os); + //cerr<<"word: "<<*sit<<"\n"; + } +} + +void +LexTorData::read_stopwords(wistream& is) { + while (!is.eof()) { + wstring w; + getline(is,w); + w=StringUtils::tolower(w); + if (w.length()>0) { + stopwords.insert(w); + wcerr<0) { + words.insert(w); + new_word_register(w); + } + } + n_set=words.size(); + 
wcerr<::iterator it; + int nlexchoices=0; + + for(it=words.begin(); it!=words.end(); it++) { + LexTorWord ambiguousword(*it, &fstp); + nlexchoices+=ambiguousword.n_lexical_choices(); + + for(int i=0; i +LexTorData::get_words() { + return words; +} + +set +LexTorData::get_lexical_choices(const wstring& word) { + return lexical_choices[StringUtils::tolower(word)]; +} + +void +LexTorData::set_nwords_per_set(int i){ + n_words_per_set=i; + wcerr< >& context) { + wcerr<::iterator its, itw; + set swaux; + + //Notice that stopwords consist of lemma and first tag while words + //consist of lemma and one (the first one) or more tags + + for(its=stopwords.begin(); its!=stopwords.end(); its++) { + bool is_ok=true; + for(itw=words.begin(); itw!=words.end(); itw++) { + //cerr<<"sw: "<<*its<<" w: "<<*itw<<"\n"; + if (itw->find(*its)==0) { + wcerr<0) && (s[0]=='^') && (s[s.length()-1]=='$')) + str=StringUtils::tolower(s.substr(1, s.length()-1)); + else + str=StringUtils::tolower(s); + + set::iterator it; + for(it=words.begin(); it!=words.end(); it++) { + if (str.find(*it)==0) { + return (*it); + } + } + + unsigned int p=str.find(L">"); + unsigned int i=0; + if (p==static_cast(wstring::npos)) { //s could correspond to an unknown word + p=str.length(); + if ((str.length()>0) && (str[0]=='*')) + i=1; // to remove the star (unknown word mark) + } + else + p++; + + if (i>=p) { + wcerr<0) && (s[0]=='^') && (s[s.length()-1]=='$')) + str=StringUtils::tolower(s.substr(1, s.length()-1)); + else + str=StringUtils::tolower(s); + + set::iterator it; + for(it=reduced_lexical_choices.begin(); it!=reduced_lexical_choices.end(); it++) { + if (str.find(*it)==0) { + return (*it); + } + } + + //return StringUtils::substitute(str," d<", " D<"); + + return str; +} + +void +LexTorData::new_word_register(const wstring& word) { + wstring w=StringUtils::tolower(word); + + if (word2index.find(w)==word2index.end()) { + index2word.push_back(w); + int ind=index2word.size()-1; + if (ind>MAX_WORD_INDEX) { + 
wcerr< > +LexTorData::get_cooccurrence_vector(const string& lexical_choice) { + vector > v; + WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)]; + map::iterator it; + + for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++) + v.push_back(*it); + + return v; +} +*/ + + +double +LexTorData::get_module_lexchoice_vector(const wstring& lexical_choice) { + WORD_DATA_TYPE ind_lexchoice=word2index[StringUtils::tolower(lexical_choice)]; + map::iterator it; + + double module=0; + + for(it=lexchoice_set[ind_lexchoice].begin(); it!= lexchoice_set[ind_lexchoice].end(); it++) + module+=(it->second)*(it->second); + + module=sqrt(module); + + return module; +} + +double +LexTorData::cosine(const wstring& reduced_lexch1, const wstring& reduced_lexch2) { + WORD_DATA_TYPE ind_lexchoice1=word2index[StringUtils::tolower(reduced_lexch1)]; + WORD_DATA_TYPE ind_lexchoice2=word2index[StringUtils::tolower(reduced_lexch2)]; + map::iterator it; + + //We calculate the scalar product + double scalar_product=0; + for(it=lexchoice_set[ind_lexchoice1].begin(); it!= lexchoice_set[ind_lexchoice1].end(); it++) { + if (lexchoice_set[ind_lexchoice2].find(it->first)!= + lexchoice_set[ind_lexchoice2].end()) { + scalar_product+=(it->second)*lexchoice_set[ind_lexchoice2][it->first]; + } + } + + //We get the module of the lexchoice vectors, ||lexchoice vector|| + double module_lexch1_vector=get_module_lexchoice_vector(reduced_lexch1); + double module_lexch2_vector=get_module_lexchoice_vector(reduced_lexch2); + + + if (module_lexch1_vector==0) { + if (LexTor::debug) { + wcerr<. + */ +#ifndef __LEXTORDATA_H +#define __LEXTORDATA_H + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define WORD_DATA_TYPE unsigned short +#define MAX_WORD_INDEX (pow(2.0,(double)(sizeof(WORD_DATA_TYPE)*8))-1) + +#define COUNT_DATA_TYPE double + +#define NULLWORD L"NULLWORD" + +using namespace std; + +/** Class LexTorData. 
(Lexical Selector Data class) + */ + +class LexTorData{ +private: + + WORD_DATA_TYPE n_stopwords; + WORD_DATA_TYPE n_words; + WORD_DATA_TYPE n_words_per_set; + WORD_DATA_TYPE n_set; + + //For a given word (or lexical choice) its index is returned and vice versa + map word2index; + vector index2word; + + map wordcount; + + //For a given lexical choice it contains the set of words it appears + //with, and for each co-appearing word, the number of times they + //co-appear + map > lexchoice_set; + + //For a given lexical choice it contains the sum of all co-appearing words + map lexchoice_sum; + + //For a given lexical choice it contains its probability + //map lexchoice_prob; + + //Set of stopwords + set stopwords; + + //Set of words to work with + set words; + + //For a given word it contains its set of lexical-choices (when available) + map > lexical_choices; + + set reduced_lexical_choices; + + void new_word_register(const wstring& w); +public: + + LexTorData(); + + LexTorData(const LexTorData& ltd); + + ~LexTorData(); + + COUNT_DATA_TYPE vote_from_word(const wstring& lexical_choice, const wstring& word); + + //double get_lexchoice_prob(const string& lexical_choice); + + COUNT_DATA_TYPE get_lexchoice_sum(const wstring& lexical_choice); + + void set_wordcount(const wstring& word, COUNT_DATA_TYPE c); + COUNT_DATA_TYPE get_wordcount(const wstring& word); + + void set_lexchoice_sum(const wstring& lexical_choice, COUNT_DATA_TYPE sum); + + bool is_stopword(const wstring& word); + + void read(FILE *is); + + void write(FILE *os); + + void read_stopwords(wistream& is); + + void read_words(wistream& is); + + void read_lexical_choices(FSTProcessor& fstp); + + void set_nwords_per_set(int i); + + void set_cooccurrence_context(const wstring& lexical_choice, + const vector >& context); + + //vector > + //get_cooccurrence_vector(const string& lexical_choice); + double get_module_lexchoice_vector(const wstring& lexical_choice); + + double cosine(const wstring& reduced_lexch1, const 
wstring& reduced_lexch2); + + set get_words(); + + set get_lexical_choices(const wstring& word); + + //Used to ensure that none of the stopwords are in the set + //of words from which co-occurrence models are being estimated + void ensure_stopwords_ok(); + + //Given a word in the apertium format, the lemma and the first tag + //are returned (both in lower case) if possible + wstring reduce(const wstring& s); + + wstring reduce_lexical_choice(const wstring& s); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/lextor_eval.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lextor_eval.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lextor_eval.cc (revision 69632) @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2004-2006 Felipe Sánchez-Martínez + * Copyright (C) 2006 Universitat d'Alacant + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include +#include +#include + +using namespace Apertium; +LexTorEval::LexTorEval(wistream* iref) { + nwords=0; + //nunknown=0; + nignored=0; + npol=0; + //nerrors_nopol=0; + nerrors_pol=0; + //nerrors_unk=0; + + ndefault=0; + + refer=iref; + + //words2ignore.insert(); + words2ignore.insert(L"as"); + words2ignore.insert(L"at"); + words2ignore.insert(L"before"); + words2ignore.insert(L"but"); + words2ignore.insert(L"by"); + words2ignore.insert(L"for"); + words2ignore.insert(L"how"); + words2ignore.insert(L"in"); + words2ignore.insert(L"just"); + words2ignore.insert(L"off"); + words2ignore.insert(L"on"); + words2ignore.insert(L"over"); + words2ignore.insert(L"right"); + words2ignore.insert(L"since"); + words2ignore.insert(L"whether"); +} + +LexTorEval::~LexTorEval() { +} + +void +LexTorEval::print_evaluation() { + wcerr<::iterator it; + wcerr<first<second<first]<first]<first]/it->second)*100<first]/it->second)*100<reduce(ltword.get_lexical_choice(winner,false)); + wstring word=lextor_data->reduce(ltword.get_word_string()); + wstring wref; + wstring reduced_wref; + bool ignore=false; + + getline(*refer,wref); + + // if (words2ignore.find(word)!=words2ignore.end()) { + // return; + //} + + if (wref.find(L">__IGNORE") != wstring::npos) + ignore=true; + + if (!ignore) { + nwords+=1.0; + reduced_wref=lextor_data->reduce(wref); + if (ltword. n_lexical_choices()>1) { + npol+=1.0; + nwords_per_word[word]+=1.0; + if (winner<0) { + ndefault+=1.0; + ndefault_per_word[word]+=1.0; + } + if (reduced_w!=reduced_wref) { + nerrors_pol+=1.0; + nerrors_per_word[word]+=1.0; + if (LexTor::debug) { + wcerr<. 
+ */ + +#ifndef __LEXTOR_EVAL_H +#define __LEXTOR_EVAL_H + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +class LexTorEval { +private: + + double nwords; + //double nunknown; + double nignored; + double npol; + //double nerrors_nopol; + double nerrors_pol; + //double nerrors_unk; + + double ndefault; + + map nwords_per_word; + map nerrors_per_word; + map ndefault_per_word; + + wistream* refer; + + set words2ignore; +public: + + LexTorEval(wistream *iref); + + ~LexTorEval(); + + void evalword(LexTorWord& ltword, int winner, LexTorData* lextor_data); + + void print_evaluation(); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/lextor_word.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/lextor_word.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/lextor_word.cc (revision 69632) @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / Universidad de Alicante + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include + +using namespace Apertium; +LexTorWord::LexTorWord() { + ignored_string = L""; + word = L""; + default_choice = 0; +} + +LexTorWord::LexTorWord(const LexTorWord& ltw) { + word=ltw.word; + ignored_string=ltw.ignored_string; + lexical_choices=ltw.lexical_choices; + default_choice=ltw.default_choice; +} + +LexTorWord::LexTorWord(const wstring &str, FSTProcessor *fstp) { + word=str; + ignored_string=L""; + extract_lexical_choices(fstp); +} + +LexTorWord::~LexTorWord() { +} + +wstring +LexTorWord::get_word_string() { + return word; +} + +int +LexTorWord::n_lexical_choices() { + return lexical_choices.size(); +} + +wstring +LexTorWord::get_lexical_choice(int choice, bool include_ignored) { + if (word == L"") { + if (include_ignored) + return ignored_string; + else + return L""; + } + + if (choice<0) + choice=default_choice; + + if (choice>=(int)lexical_choices.size()) { + wcerr<=(int)lexical_choices.size()) { + wcerr<biltrans(word,false), L"/"); + default_choice=0; + + if (lexical_choices.size()>1) { //lexically ambiguous word + for(unsigned int i=0; i(string::npos)) { + if (!((lexical_choices[i].length()>p+2) && (lexical_choices[i][p+2]=='<'))) { + wcerr<biltrans(word,false)<>c; + + if (is.fail()) { + if (reading_word) { + wcerr<0)||(w.ignored_string.length()>0)) { + if(fstp!=NULL) + w.extract_lexical_choices(fstp); + return new LexTorWord(w); + } else + return NULL; + } + } + + if ((c==L'^') && (prev_c!=L'\\') && (!reading_word)) { + reading_word=true; + } else if ((c==L'$') && (prev_c!=L'\\') && (reading_word)) { + finish=true; + } else { + if (reading_word) + w.word+=c; + else + w.ignored_string+=c; + } + prev_c=c; + } + + if ((w.word.length()==0) && (w.ignored_string.length()==0)) + return NULL; + + if(fstp!=NULL) + w.extract_lexical_choices(fstp); + + /* + cerr<<"word: "<. 
+ */ +#ifndef __LEXTORWORD_H +#define __LEXTORWORD_H + +#include +#include +#include +#include +#include + +#include +#include + +using namespace std; + +/** Class LexTorWord. (Lexical Selector Word) + */ + +class LexTorWord{ +private: + wstring word; + wstring ignored_string; + vector lexical_choices; + int default_choice; + + void extract_lexical_choices(FSTProcessor *fstp); +public: + + LexTorWord(); + + LexTorWord(const LexTorWord& ltw); + + LexTorWord(const wstring& str, FSTProcessor *fstp); + + ~LexTorWord(); + + /** Returns the lexical choice at position 'choice'; if 'choice' is not + * given, the default one is returned + */ + wstring get_lexical_choice(int choice=-1, bool include_ignored=true); + + /** Returns the number of lexical choices for this word + */ + int n_lexical_choices(); + + wstring get_word_string(); + + wstring translate(FSTProcessor& bildic, int choice=-1); + + + /** When calling this method the set of lexical choices for each word + * will be extracted from the FSTProcessor object if present. + * Moreover the input stream (is) is supposed to be in the + * intermediate format used by the apertium MT system. + */ + static LexTorWord* next_word(wistream& is, FSTProcessor *fstp=NULL); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/morpho_stream.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/morpho_stream.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/morpho_stream.cc (revision 69632) @@ -0,0 +1,402 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +/** + * Word class and MorphoStream class definitions + * + * @author Felipe Sánchez-Martínez + */ + +#include +#include +#include +#include +#include "apertium_config.h" +#include + +using namespace Apertium; +MorphoStream::MorphoStream(FILE *ftxt, bool d, TaggerData *t) +{ + foundEOF = false; + debug=d; + td = t; + me = td->getPatternList().newMatchExe(); + alphabet = td->getPatternList().getAlphabet(); + input = ftxt; + ca_any_char = alphabet(PatternList::ANY_CHAR); + ca_any_tag = alphabet(PatternList::ANY_TAG); + + ConstantManager &constants = td->getConstants(); + ca_kignorar = constants.getConstant(L"kIGNORAR"); + ca_kbarra = constants.getConstant(L"kBARRA"); + ca_kdollar = constants.getConstant(L"kDOLLAR"); + ca_kbegin = constants.getConstant(L"kBEGIN"); + ca_kmot = constants.getConstant(L"kMOT"); + ca_kmas = constants.getConstant(L"kMAS"); + ca_kunknown = constants.getConstant(L"kUNKNOWN"); + + map &tag_index = td->getTagIndex(); + ca_tag_keof = tag_index[L"TAG_kEOF"]; + ca_tag_kundef = tag_index[L"TAG_kUNDEF"]; + + end_of_file = false; + null_flush = false; +} + +MorphoStream::~MorphoStream() +{ + delete me; +} + +TaggerWord * +MorphoStream::get_next_word() +{ + if(vwords.size() != 0) + { + TaggerWord* word=vwords.front(); + vwords.erase(vwords.begin()); + + if(word->isAmbiguous()) + { + vector &ref = td->getDiscardRules(); + for(unsigned int i = 0; i < ref.size(); i++) + { + word->discardOnAmbiguity(ref[i]); + } + } +// cout << *word << endl; + return word; + } + + if(feof(input)) + { + return NULL; + } + + int ivwords = 0; + vwords.push_back(new TaggerWord()); + + while(true) + { + int 
symbol = fgetwc_unlocked(input); + if(feof(input) || (null_flush && symbol == L'\0')) + { + end_of_file = true; + vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + return get_next_word(); + } + if(symbol == L'^') + { + readRestOfWord(ivwords); + return get_next_word(); + } + else + { + wstring str = L""; + if(symbol == L'\\') + { + symbol = fgetwc_unlocked(input); + str += L'\\'; + str += static_cast(symbol); + symbol = L'\\'; + } + else + { + str += static_cast(symbol); + } + + while(symbol != L'^') + { + symbol = fgetwc_unlocked(input); + if(feof(input) || (null_flush && symbol == L'\0')) + { + end_of_file = true; + vwords[ivwords]->add_ignored_string(str); + vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + return get_next_word(); + } + else if(symbol == L'\\') + { + str += L'\\'; + symbol = fgetwc_unlocked(input); + if(feof(input) || (null_flush && symbol == L'\0')) + { + end_of_file = true; + vwords[ivwords]->add_ignored_string(str); + vwords[ivwords]->add_tag(ca_tag_keof, L"", td->getPreferRules()); + return get_next_word(); + } + str += static_cast(symbol); + symbol = L'\\'; + } + else if(symbol == L'^') + { + if(str.size() > 0) + { + vwords[ivwords]->add_ignored_string(str); + } + readRestOfWord(ivwords); + return get_next_word(); + } + else + { + str += static_cast(symbol); + } + } + } + } +} + +void +MorphoStream::lrlmClassify(wstring const &str, int &ivwords) +{ + int floor = 0; + int last_type = -1; + int last_pos = 0; + + ms.init(me->getInitial()); + for(int i = 0, limit = str.size(); i != limit; i++) + { + if(str[i] != L'<') + { + if(str[i] == L'+') + { + int val = ms.classifyFinals(me->getFinals()); + if(val != -1) + { + last_pos = i-1; + last_type = val; + } + } + ms.step(towlower(str[i]), ca_any_char); + } + else + { + wstring tag = L""; + for(int j = i+1; j != limit; j++) + { + if(str[j] == L'\\') + { + j++; + } + else if(str[j] == L'>') + { + tag = str.substr(i, j-i+1); + i = j; + break; + } + } + + int symbol 
= alphabet(tag); + if(symbol) + { + ms.step(symbol, ca_any_tag); + } + else + { + ms.step(ca_any_tag); + } + } + + if(ms.size() == 0) + { + if(last_pos != floor) + { + vwords[ivwords]->add_tag(last_type, + str.substr(floor, last_pos - floor + 1), + td->getPreferRules()); + if(str[last_pos+1] == L'+' && last_pos+1 < limit ) + { + floor = last_pos + 1; + last_pos = floor; + vwords[ivwords]->set_plus_cut(true); + if (((int)vwords.size())<=((int)(ivwords+1))) + vwords.push_back(new TaggerWord(true)); + ivwords++; + ms.init(me->getInitial()); + } + i = floor++; + } + else + { + if (debug) + { + wcerr<add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); + return; + } + } + else if(i == limit - 1) + { + if(ms.classifyFinals(me->getFinals()) == -1) + { + if(last_pos != floor) + { + vwords[ivwords]->add_tag(last_type, + str.substr(floor, last_pos - floor + 1), + td->getPreferRules()); + if(str[last_pos+1] == L'+' && last_pos+1 < limit ) + { + floor = last_pos + 1; + last_pos = floor; + vwords[ivwords]->set_plus_cut(true); + if (((int)vwords.size())<=((int)(ivwords+1))) + vwords.push_back(new TaggerWord(true)); + ivwords++; + ms.init(me->getInitial()); + } + i = floor++; + } + else + { + if (debug) + { + wcerr<add_tag(ca_tag_kundef, str.substr(floor) , td->getPreferRules()); + return; + } + } + } + } + + int val = ms.classifyFinals(me->getFinals()); + if(val == -1) + { + val = ca_tag_kundef; + if (debug) + { + wcerr<add_tag(val, str.substr(floor), td->getPreferRules()); +} + +void +MorphoStream::readRestOfWord(int &ivwords) +{ + // first we have the superficial form + wstring str = L""; + + while(true) + { + int symbol = fgetwc_unlocked(input); + if(feof(input) || (null_flush && symbol == L'\0')) + { + end_of_file = true; + if(str.size() > 0) + { + vwords[ivwords]->add_ignored_string(str); + wcerr<get_superficial_form()<add_tag(ca_tag_keof, L"", td->getPreferRules()); + return; + } + else if(symbol == L'\\') + { + symbol = fgetwc_unlocked(input); + str += L'\\'; 
+ str += static_cast(symbol); + } + else if(symbol == L'/') + { + vwords[ivwords]->set_superficial_form(str); + str = L""; + break; + } + else if(symbol == L'$') + { + vwords[ivwords]->set_superficial_form(str); + vwords[ivwords]->add_ignored_string(L"$"); + break; + } + else + { + str += static_cast(symbol); + } + } + + // then we read the acceptions + + while(true) + { + int symbol = fgetwc_unlocked(input); + if(feof(input) || (null_flush && symbol == L'\0')) + { + end_of_file = true; + if(str.size() > 0) + { + vwords[ivwords]->add_ignored_string(str); + wcerr<get_superficial_form()<add_tag(ca_tag_keof, L"", td->getPreferRules()); + return; + } + else if(symbol == L'\\') + { + symbol = fgetwc_unlocked(input); + str += L'\\'; + str += static_cast(symbol); + symbol = L'\\'; // to prevent exiting with '\$' + } + else if(symbol == L'/') + { + lrlmClassify(str, ivwords); + str = L""; + ivwords = 0; + continue; + } + else if(symbol == L'$') + { + if(str[0] != L'*')// do nothing with unknown words + { + lrlmClassify(str, ivwords); + } + return; + } + else + { + str += static_cast(symbol); + } + } +} + +void +MorphoStream::setNullFlush(bool nf) +{ + null_flush = nf; +} + +bool +MorphoStream::getEndOfFile(void) +{ + return end_of_file; +} + +void +MorphoStream::setEndOfFile(bool eof) +{ + end_of_file = eof; +} Index: branches/apertium-tagger/apertium2/apertium/morpho_stream.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/morpho_stream.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/morpho_stream.h (revision 69632) @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +/** + * Word class and MorphoStream class definitions + * + * @author Felipe Sánchez-Martínez + */ + +#ifndef __MORPHOSTREAM_H +#define __MORPHOSTREAM_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +/** Class MorphoStream. + * This class processes the output of class yyFlexLexer (lex.yy.cc), and + * builds the TaggerWord objects managed by the tagger + */ +class MorphoStream { +private: + bool foundEOF; + wstring last_string_tag; + bool debug; + FILE *input; + int ca_any_char; + int ca_any_tag; + int ca_kignorar; + int ca_kbarra; + int ca_kdollar; + int ca_kbegin; + int ca_kmot; + int ca_kmas; + int ca_kunknown; + int ca_tag_keof; + int ca_tag_kundef; + + vector vwords; //Vector used to implement a buffer + //to treat ambiguous multiword units + + MatchExe *me; + TaggerData *td; + Alphabet alphabet; + MatchState ms; + + bool null_flush; + bool end_of_file; + + void readRestOfWord(int &ivwords); + void lrlmClassify(wstring const &str, int &ivwords); +public: + + /** Constructor + * @param is the input stream. 
+ */ + MorphoStream(FILE *ftxt, bool d, TaggerData *t); + + /** + * Destructor + */ + ~MorphoStream(); + + /** Get next word in the input stream + * @return A pointer to the next word in the input stream + */ + TaggerWord* get_next_word(); + + /** + * Set up the flag to detect '\0' characters + * @param nf the null_flush value + */ + void setNullFlush(bool nf); + + /** + * Return true if the last reading is end of file of '\0' when null_flush + * is true + * @returns the value of end_of_file + */ + bool getEndOfFile(void); + + /** + * Sets a new value for the end_of_file_flag + * @param eof the new value for end_of_file + */ + void setEndOfFile(bool eof); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/new2old.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/new2old.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/new2old.xsl (revision 69632) @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
\ No newline at end of file Index: branches/apertium-tagger/apertium2/apertium/postchunk.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/postchunk.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/postchunk.dtd (revision 69632) @@ -0,0 +1,434 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/reformat.xsl =================================================================== --- branches/apertium-tagger/apertium2/apertium/reformat.xsl (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/reformat.xsl (revision 69632) @@ -0,0 +1,237 @@ + + + + + + + +%{ + +#ifndef GENFORMAT +#include "apertium_config.h" +#endif +#include <apertium/unlocked_cstdio.h> + +#include <cstdlib> +#include <iostream> +#include <libgen.h> +#include <map> +#include <string> +#include <unistd.h> +#include <lttoolbox/lt_locale.h> +#include <lttoolbox/ltstr.h> +#include <wchar.h> +#ifdef _WIN32 +#include <io.h> +#include <fcntl.h> +#endif + +using namespace std; + + + + + + + + + + + + + + + + + + + + + + + + +string memconv; + +wstring convertir(char const *multibyte, int const length) +{ + memconv.append(multibyte, length); + int tam = memconv.size(); + if (memconv == "") + return L""; + wchar_t *retval = new wchar_t[tam+1]; + size_t l = mbstowcs(retval, memconv.c_str(), tam); + + if(l == ((size_t) -1)) + { + if(memconv.size() >= 4) + { + wcerr << L"Warning: wrong encoding" << endl; + } + if (retval != NULL) + delete[] retval; + return L""; + } + else + { + memconv = ""; + retval[l] = 0; + wstring ret 
= retval; + if (retval != NULL) + delete[] retval; + return ret; + } +} + +%} + +%option nounput +%option noyywrap +%option caseless + +%% + +"["|"]" { + // do nothing +} + +"[@"[^]]+"]" { + string filename = yytext; + filename = filename.substr(2, filename.size()-3); + FILE *temp = fopen(filename.c_str(), "r"); + wint_t mychar; +#ifdef _WIN32 + _setmode(_fileno(temp), _O_U8TEXT); +#endif + + if(!temp) + { + cerr << "ERROR: File '" << filename <<"' not found." << endl; + exit(EXIT_FAILURE); + } + while(static_cast<int>(mychar = fgetwc_unlocked(temp)) != EOF) + { + fputwc_unlocked(mychar, yyout); + } + fclose(temp); + unlink(filename.c_str()); +} + +"[\\@" { + fputwc_unlocked(L'@', yyout); +} + +".[]" { + // do nothing +} + +"\\" { + fputws_unlocked(convertir(yytext+1, yyleng-1).c_str(), yyout); +} + + + +.|\n { + wstring yytext_conv = convertir(yytext, yyleng); + + + + + + + + + + + + + + + + + + + + + + + + + +} + +<<EOF>> { + return 0; +} + +%% + +void usage(string const &progname) +{ + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + cerr << " format processor " << endl; + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + + if(argc > 3) + { + usage(argv[0]); + } + + switch(argc) + { + case 3: + yyout = fopen(argv[2], "w"); + if(!yyout) + { + usage(argv[0]); + } + case 2: + yyin = fopen(argv[1], "r"); + if(!yyin) + { + usage(argv[0]); + } + break; + default: + break; + } +#ifdef _WIN32 + _setmode(_fileno(yyin), _O_U8TEXT); + _setmode(_fileno(yyout), _O_U8TEXT); +#endif + + + + + + + + yylex(); + fclose(yyin); + fclose(yyout); +} + + Index: branches/apertium-tagger/apertium2/apertium/string_utils.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/string_utils.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/string_utils.cc (revision 69632) @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2006 Universitat d'Alacant / 
Universidad de Alicante + * author: Felipe Sánchez-Martínez + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include + +#ifdef _MSC_VER +#define snprintf _snprintf +#endif + +//Delete white spaces from the end and the begining of the string +wstring +StringUtils::trim(wstring const &str) +{ + if(str == L"") + { + return L""; + } + + int begin = 0, end = str.size() - 1; + + while(begin < end && iswspace(str[begin])) + { + begin++; + } + + while(end > begin && iswspace(str[end])) + { + end--; + } + + if(!iswspace(str[end])) + { + end++; + } + + return str.substr(begin, end-begin); +} + +vector +StringUtils::split_wstring(wstring const &input, wstring const &delimiter) +{ + unsigned pos; + int new_pos; + vector result; + wstring s = L""; + pos=0; + + while(pos const &v) +{ + wstring s = L""; + for(unsigned i=0; i0) + s+=L' '; + s.append(v[i]); + } + return s; +} + +wstring +StringUtils::substitute(wstring const &source, wstring const &olds, wstring const &news) { + wstring s = source; + + unsigned int p=s.find(olds , 0); + while (p!=static_cast(wstring::npos)) + { + s.replace(p, olds.length(), news); + p+=news.length(); + p=s.find(olds,p); + } + + return s; +} + +wstring +StringUtils::itoa(int n) +{ + return XMLParseUtil::stows(itoa_string(n)); +} + +string +StringUtils::itoa_string(int n) +{ + char str[256]; + snprintf(str, 256, "%d", n); + return str; +} + 
+wstring +StringUtils::ftoa(double f) +{ + char str[256]; + sprintf(str, "%f",f); + return XMLParseUtil::stows(str); +} + +wstring +StringUtils::tolower(wstring const &s) +{ + wstring l=s; + for(unsigned i=0; i. + */ +#ifndef __STRINGUTILS_H_ +#define __STRINGUTILS_H_ + +#include +#include +#include + +using namespace std; + +namespace Apertium +{ + bool operator==(string const &s1, string const &s2); + bool operator==(string const &s1, char const *s2); + bool operator==(char const *s1, string const &s2); + bool operator!=(string const &s1, string const &s2); + bool operator!=(string const &s1, char const *s2); + bool operator!=(char const *s1, string const &s2); +} + +class StringUtils { + public: + + static wstring trim(wstring const &str); + + static vector split_wstring(wstring const &input, wstring const &delimiter); + + static wstring vector2wstring(vector const &v); + + //Replace each ocurrence of the string 'olds' by the string 'news' in string 'source' + static wstring substitute(const wstring &source, const wstring &olds, const wstring &news); + + static wstring itoa(int n); + + static string itoa_string(int n); + + static wstring ftoa(double f); + + static wstring tolower(wstring const &s); + + static wstring toupper(wstring const &s); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tagger.dtd =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger.dtd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger.dtd (revision 69632) @@ -0,0 +1,157 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/tagger_data.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_data.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_data.cc 
(revision 69632) @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include + +using namespace Apertium; + +void +TaggerData::copy(TaggerData const &o) +{ + open_class = o.open_class; + forbid_rules = o.forbid_rules; + tag_index = o.tag_index; + array_tags = o.array_tags; + enforce_rules = o.enforce_rules; + prefer_rules = o.prefer_rules; + constants = o.constants; + output = o.output; + plist = o.plist; +} + +TaggerData::TaggerData() +{ +} + +TaggerData::~TaggerData() +{ +} + +TaggerData::TaggerData(TaggerData const &o) +{ + copy(o); +} + +TaggerData & +TaggerData::operator =(TaggerData const &o) +{ + if(this != &o) + { + copy(o); + } + return *this; +} + +set & +TaggerData::getOpenClass() +{ + return open_class; +} + +void +TaggerData::setOpenClass(set const &oc) +{ + open_class = oc; +} + +vector & +TaggerData::getForbidRules() +{ + return forbid_rules; +} + +void +TaggerData::setForbidRules(vector &fr) +{ + forbid_rules = fr; +} + +map & +TaggerData::getTagIndex() +{ + return tag_index; +} + +void +TaggerData::setTagIndex(map const &ti) +{ + tag_index = ti; +} + +vector & +TaggerData::getArrayTags() +{ + return array_tags; +} + +void +TaggerData::setArrayTags(vector const &at) +{ + array_tags = at; +} + +vector & +TaggerData::getEnforceRules() +{ + return 
enforce_rules; +} + +void +TaggerData::setEnforceRules(vector const &tear) +{ + enforce_rules = tear; +} + +vector & +TaggerData::getPreferRules() +{ + return prefer_rules; +} + +void +TaggerData::setPreferRules(vector const &pr) +{ + prefer_rules = pr; +} + +vector & +TaggerData::getDiscardRules() +{ + return discard; +} + +void +TaggerData::setDiscardRules(vector const &v) +{ + discard = v; +} + +ConstantManager & +TaggerData::getConstants() +{ + return constants; +} + +void +TaggerData::setConstants(ConstantManager const &c) +{ + constants = c; +} + +Collection & +TaggerData::getOutput() +{ + return output; +} + +void +TaggerData::setOutput(Collection const &c) +{ + output = c; +} + +PatternList & +TaggerData::getPatternList() +{ + return plist; +} + +void +TaggerData::setPatternList(PatternList const &pl) +{ + plist = pl; +} + +void +TaggerData::addDiscard(wstring const &tags) +{ + discard.push_back(tags); +} Index: branches/apertium-tagger/apertium2/apertium/tagger_data.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_data.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_data.h (revision 69632) @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _TAGGERDATA_ +#define _TAGGERDATA_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + +class TaggerData +{ +protected: + set open_class; + vector forbid_rules; + map tag_index; + vector array_tags; + vector enforce_rules; + vector prefer_rules; + ConstantManager constants; + Collection output; + PatternList plist; + + vector discard; + + void copy(TaggerData const &o); +public: + TaggerData(); + virtual ~TaggerData(); + TaggerData(TaggerData const &o); + TaggerData & operator =(TaggerData const &o); + + set & getOpenClass(); + void setOpenClass(set const &oc); + + vector & getForbidRules(); + void setForbidRules(vector &fr); + + map & getTagIndex(); + void setTagIndex(map const &ti); + + vector & getArrayTags(); + void setArrayTags(vector const &at); + + vector & getEnforceRules(); + void setEnforceRules(vector const &tear); + + vector & getPreferRules(); + void setPreferRules(vector const &pr); + + vector & getDiscardRules(); + void setDiscardRules(vector const &dr); + + ConstantManager & getConstants(); + void setConstants(ConstantManager const &c); + + virtual Collection & getOutput(); + void setOutput(Collection const &c); + + void setPatternList(PatternList const &pl); + void addDiscard(wstring const &tags); + PatternList & getPatternList(); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tagger_data_hmm.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_data_hmm.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_data_hmm.h (revision 69632) @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) 
any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _TAGGERDATAHMM_ +#define _TAGGERDATAHMM_ + +#include + +class TaggerDataHMM : public TaggerData +{ +private: + int N; + int M; + double **a; + double **b; + + void destroy(); +public: + TaggerDataHMM(); + virtual ~TaggerDataHMM(); + TaggerDataHMM(TaggerDataHMM const &o); + TaggerDataHMM(TaggerData const &o); + TaggerDataHMM & operator =(TaggerDataHMM const &o); + + virtual void setProbabilities(int const myN, int const myM, + double **myA = NULL, double **myB = NULL); + + virtual double ** getA(); + virtual double ** getB(); + virtual int getN(); + virtual int getM(); + + virtual void read(FILE *in); + virtual void write(FILE *out); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tagger_data_lsw.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_data_lsw.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_data_lsw.cc (revision 69632) @@ -0,0 +1,324 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include + +using namespace Apertium; + +void +TaggerDataLSW::destroy() +{ + if (d != NULL) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + delete [] d[i][j]; + } + delete [] d[i]; + } + delete [] d; + } + d = NULL; + + N = 0; +} + +TaggerDataLSW::TaggerDataLSW() +{ + d = NULL; + N = 0; +} + +TaggerDataLSW::~TaggerDataLSW() +{ + destroy(); +} + +TaggerDataLSW::TaggerDataLSW(TaggerDataLSW const &o) +{ + d = NULL; + N = 0; + TaggerData::copy(o); + this->setProbabilities(o.N, o.d); +} + +TaggerDataLSW::TaggerDataLSW(TaggerData const &o) +{ + d = NULL; + N = 0; + TaggerData::copy(o); +} + +TaggerDataLSW & +TaggerDataLSW::operator =(TaggerDataLSW const &o) +{ + if(this != &o) + { + destroy(); + TaggerData::copy(o); + this->setProbabilities(o.N, o.d); + } + return *this; +} + +void +TaggerDataLSW::setProbabilities(int const myN, double ***myD) { + this->destroy(); + N = myN; + if(N != 0) { + d = new double ** [N]; + for (int i = 0; i < N; ++i) { + d[i] = new double * [N]; + for (int j = 0; j < N; ++j) { + d[i][j] = new double [N]; + if (myD != NULL) { + for (int k = 0; k < N; ++k) { + d[i][j][k] = myD[i][j][k]; + } + } + } + } + } else { + d = NULL; + } +} + +double *** +TaggerDataLSW::getD() { + return d; +} + +int +TaggerDataLSW::getN() +{ + return N; +} + +void +TaggerDataLSW::read(FILE *in) +{ + destroy(); + + // open_class + int val = 0; + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + val += Compression::multibyte_read(in); + open_class.insert(val); + } + + // forbid_rules + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + TForbidRule aux; + aux.tagi = Compression::multibyte_read(in); + aux.tagj = Compression::multibyte_read(in); + forbid_rules.push_back(aux); + } + + + // array_tags + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + 
array_tags.push_back(Compression::wstring_read(in)); + } + + // tag_index + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + wstring tmp = Compression::wstring_read(in); + tag_index[tmp] = Compression::multibyte_read(in); + } + + // enforce_rules + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + TEnforceAfterRule aux; + aux.tagi = Compression::multibyte_read(in); + for(int j = Compression::multibyte_read(in); j != 0; j--) + { + aux.tagsj.push_back(Compression::multibyte_read(in)); + } + enforce_rules.push_back(aux); + } + + // prefer_rules + for(int i = Compression::multibyte_read(in); i != 0; i--) + { + prefer_rules.push_back(Compression::wstring_read(in)); + } + + // constants + constants.read(in); + + // output + output.read(in); + + // dimensions + N = Compression::multibyte_read(in); + + d = new double ** [N]; + for ( int i = 0; i < N; ++i) { + d[i] = new double * [N]; + for (int j = 0; j < N; ++j) { + d[i][j] = new double [N]; + } + } + + // initializing d matrix + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + d[i][j][k] = 0; + } + } + } + + int nval = Compression::multibyte_read(in); + for(; nval != 0; nval--) { + int i = Compression::multibyte_read(in); + int j = Compression::multibyte_read(in); + int k = Compression::multibyte_read(in); + d[i][j][k] = EndianDoubleUtil::read(in); + } + + // read pattern list + plist.read(in); + + // read discards on ambiguity + discard.clear(); + + int limit = Compression::multibyte_read(in); + if(feof(in)) + { + return; + } + + for(int i = 0; i < limit; i++) + { + discard.push_back(Compression::wstring_read(in)); + } +} + +void +TaggerDataLSW::write(FILE *out) +{ + + // open_class + Compression::multibyte_write(open_class.size(), out); + int val = 0; + for(set::const_iterator it = open_class.begin(), limit = open_class.end(); + it != limit; it++) + { + Compression::multibyte_write(*it-val, out); + val = *it; + } + + // forbid_rules + 
Compression::multibyte_write(forbid_rules.size(), out); + for(unsigned int i = 0, limit = forbid_rules.size(); i != limit; i++) + { + Compression::multibyte_write(forbid_rules[i].tagi, out); + Compression::multibyte_write(forbid_rules[i].tagj, out); + } + + // array_tags + Compression::multibyte_write(array_tags.size(), out); + for(unsigned int i = 0, limit = array_tags.size(); i != limit; i++) + { + Compression::wstring_write(array_tags[i], out); + } + + // tag_index + Compression::multibyte_write(tag_index.size(), out); + for(map::iterator it = tag_index.begin(), limit = tag_index.end(); + it != limit; it++) + { + Compression::wstring_write(it->first, out); + Compression::multibyte_write(it->second, out); + } + + // enforce_rules + Compression::multibyte_write(enforce_rules.size(), out); + for(unsigned int i = 0, limit = enforce_rules.size(); i != limit; i++) + { + Compression::multibyte_write(enforce_rules[i].tagi, out); + Compression::multibyte_write(enforce_rules[i].tagsj.size(), out); + for(unsigned int j = 0, limit2 = enforce_rules[i].tagsj.size(); j != limit2; j++) + { + Compression::multibyte_write(enforce_rules[i].tagsj[j], out); + } + } + + // prefer_rules + Compression::multibyte_write(prefer_rules.size(), out); + for(unsigned int i = 0, limit = prefer_rules.size(); i != limit; i++) + { + Compression::wstring_write(prefer_rules[i], out); + } + + // constants + constants.write(out); + + // output + output.write(out); + + // d matrix + Compression::multibyte_write(N, out); + + int nval = 0; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + if (d[i][j][k] > ZERO) { + ++nval; + } + } + } + } + Compression::multibyte_write(nval, out); + + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + for (int k = 0; k < N; ++k) { + if (d[i][j][k] > ZERO) { + Compression::multibyte_write(i, out); + Compression::multibyte_write(j, out); + Compression::multibyte_write(k, out); + EndianDoubleUtil::write(out, 
d[i][j][k]); + } + } + } + } + + // write pattern list + plist.write(out); + + // write discard list + + if(discard.size() != 0) + { + Compression::multibyte_write(discard.size(), out); + for(unsigned int i = 0, limit = discard.size(); i != limit; i++) + { + Compression::wstring_write(discard[i], out); + } + } +} + Index: branches/apertium-tagger/apertium2/apertium/tagger_data_lsw.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_data_lsw.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_data_lsw.h (revision 69632) @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _TAGGERDATALSW_ +#define _TAGGERDATALSW_ + +#include + +class TaggerDataLSW : public TaggerData +{ +private: + int N; + double ***d; + + void destroy(); + +public: + TaggerDataLSW(); + virtual ~TaggerDataLSW(); + TaggerDataLSW(TaggerDataLSW const &o); + TaggerDataLSW(TaggerData const &o); + TaggerDataLSW & operator =(TaggerDataLSW const &o); + + void setProbabilities(int const myN, double ***myD = NULL); + + virtual double *** getD(); + virtual int getN(); + + void read(FILE *in); + void write(FILE *out); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tagger_word.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tagger_word.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tagger_word.h (revision 69632) @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef __TAGGERWORD_H +#define __TAGGERWORD_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +/** Class TaggerWord. + * It stores the superficial form and all possible tags that it can receive. + * It has the fine tags delivered by the morphological analyzer and the coarse + * ones used by the PoS tagger. 
+ */ +class TaggerWord{ +private: + wstring superficial_form; + + set tags; //Set of all possible tags + map lexical_forms; //For a given coarse tag it stores the fine tag + //delevered by the morphological analyzer + wstring ignored_string; + + bool plus_cut; //Flag to distinguish the way in which the word was ended. + //If it was done by '$' its value should be false + //If it was done by '+' its value should be true + bool previous_plus_cut; //Flag to distinguish the way in which the + //previous word was ended. It has the same + //plus_cut meaning + bool show_sf; // Show the superficial form in the output + static map patterns; + + bool match(wstring const &s, wstring const &pattern); +public: + static bool generate_marks; + static vector array_tags; + + static bool show_ignored_string; + + /** + * Constructor + */ + TaggerWord(bool prev_plus_cut=false); + + /** + * Copy constructor + */ + TaggerWord(const TaggerWord &w); + + /** + * Destructor + */ + virtual ~TaggerWord(); + + /** Set the superficial form of the word. + * @param s the superficial form + */ + void set_superficial_form(const wstring &s); + + /** Get the superficial form of the word + * + */ + wstring& get_superficial_form(); + + /** Add a new tag to the set of all possible tags of the word. + * @param t the coarse tag + * @param lf the lexical form (fine tag) + */ + virtual void add_tag(TTag &t, const wstring &lf, vector const &prefer_rules); + + /** Get the set of tags of this word. + * @return set of tags. 
+ */ + virtual set& get_tags(); + + /** Get a wstring with the set of tags + */ + virtual wstring get_string_tags(); + + /** Get the lexical form (fine tag) for a given tag (coarse one) + * @param t the tag + * @return the lexical form of tag t + */ + virtual wstring get_lexical_form(TTag &t, int const TAG_kEOF); + + wstring get_all_chosen_tag_first(TTag &t, int const TAG_kEOF); + + /** Get the lexical form (fine tag) for a given tag (coarse one) + * @param t the tag + * @return the lexical form of tag t without other text that + * is ignored. + */ + wstring get_lexical_form_without_ignored_string(TTag &t, int const TAG_kEOF); + + /** Add text to the ignored string + * + */ + void add_ignored_string(wstring const &s); + + /** Set the flag plus_cut to a certain value. If this flag is set to true means + * that there were a '+' between this word and the next one + */ + void set_plus_cut(const bool &c); + + /** + * Get and set the "show superficial form" flag + */ + void set_show_sf(bool sf); + bool get_show_sf(); + + /** Get the value of the plus_cut flag */ + bool get_plus_cut(); + + /** Output operator + */ + friend wostream& operator<< (wostream& os, TaggerWord &w); + + static void setArrayTags(vector const &at); + + void print(); + + void outputOriginal(FILE *output); + + bool isAmbiguous() const; // CAUTION: unknown words are not considered to + // be ambiguous by this method + + void discardOnAmbiguity(wstring const &tags); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/tmx_builder.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_builder.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_builder.h (revision 69632) @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by 
the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _TMXBUILDER_ +#define _TMXBUILDER_ + +#include +#include +#include + +using namespace std; + +class TMXBuilder +{ +private: + wstring lang1; + wstring lang2; + unsigned int max_edit; + unsigned int diagonal_width; + unsigned int window_size; + unsigned int step; + double percent; + double edit_distance_percent; + unsigned int low_limit; + FILE *freference; + + static wstring nextTU(FILE *input); + static wstring restOfBlank(FILE *input); + static wstring nextBlank(FILE *input); + static wstring xmlize(wstring const &str); + static bool compatible(FILE *input, FILE *output, bool lazy = false); + void generateTMX(FILE *f1, FILE *f2, FILE *output); + void outputTU(FILE *f1, FILE *f2, FILE *output); + static vector reverseList(vector const &v); + static vector sentenceList(FILE *file); + static int argmin(int nw, int n, int w); + static int * levenshteinTable(vector &l1, vector &l2, + unsigned int diagonal_width, unsigned int max_edit); + void printTU(FILE *output, wstring const &tu1, wstring const &tu2) const; + static wstring filter(wstring const &s); + static int weight(wstring const &s); + static void printTable(int *table, unsigned int nrows, unsigned int ncols); + static int editDistance(wstring const &s1, wstring const &s2, unsigned int max_edit); + static int min3(int i1, int i2, int i3); + static int min2(int i1, int i2); + void printTUCond(FILE *output, wstring const &s1, wstring const &s2, bool secure_zone); + static vector extractFragment(vector const &text, unsigned int base, 
+ unsigned int width); + + static bool isRemovablePunct(wchar_t const &c); + bool similar(wstring const &s1, wstring const &s2); + + void splitAndMove(FILE *file, string const &filename); +public: + TMXBuilder(wstring const &l1, wstring const &l2); + ~TMXBuilder(); + static bool check(string const &file1, string const &file2, bool lazy = false); + void generate(string const &file1, string const &file2, + string const &outfile=""); + + void setMaxEdit(int me); + void setDiagonalWidth(int dw); + void setWindowSize(int ws); + void setStep(int s); + void setPercent(double p); + void setLowLimit(int l); + void setEditDistancePercent(double e); + void setTranslation(string const &filename); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transfer_data.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_data.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_data.cc (revision 69632) @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2005--2015 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +void +TransferData::copy(TransferData const &o) +{ + alphabet = o.alphabet; + transducer = o.transducer; + finals = o.finals; + attr_items = o.attr_items; + macros = o.macros; + lists = o.lists; + variables = o.variables; +} + +void +TransferData::destroy() +{ +} + +TransferData::TransferData() +{ + // adding fixed attr_items + attr_items[L"lem"] = L"(([^<]|\"\\<\")+)"; + attr_items[L"lemq"] = L"\\#[- _][^<]+"; + attr_items[L"lemh"] = L"(([^<#]|\"\\<\"|\"\\#\")+)"; + attr_items[L"whole"] = L"(.+)"; + attr_items[L"tags"] = L"((<[^>]+>)+)"; + attr_items[L"chname"] = L"({([^/]+)\\/)"; // includes delimiters { and / !!! + attr_items[L"chcontent"] = L"(\\{.+)"; + attr_items[L"content"] = L"(\\{.+)"; +} + +TransferData::~TransferData() +{ + destroy(); +} + +TransferData::TransferData(TransferData const &o) +{ + copy(o); +} + +TransferData & +TransferData::operator =(TransferData const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} + +Alphabet & +TransferData::getAlphabet() +{ + return alphabet; +} + +Transducer & +TransferData::getTransducer() +{ + return transducer; +} + +map & +TransferData::getFinals() +{ + return finals; +} + +map & +TransferData::getAttrItems() +{ + return attr_items; +} + +map & +TransferData::getMacros() +{ + return macros; +} + +map, Ltstr> & +TransferData::getLists() +{ + return lists; +} + +map & +TransferData::getVariables() +{ + return variables; +} + +void +TransferData::write(FILE *output) +{ + alphabet.write(output); + transducer.write(output, alphabet.size()); + + // finals + + Compression::multibyte_write(finals.size(), output); + for(map::const_iterator it = finals.begin(), limit = finals.end(); + it != limit; it++) + { + Compression::multibyte_write(it->first, output); + Compression::multibyte_write(it->second, output); + } + + // attr_items + + // precompiled regexps + 
writeRegexps(output); + + // variables + Compression::multibyte_write(variables.size(), output); + for(map::const_iterator it = variables.begin(), limit = variables.end(); + it != limit; it++) + { + Compression::wstring_write(it->first, output); + Compression::wstring_write(it->second, output); + } + + // macros + Compression::multibyte_write(macros.size(), output); + for(map::const_iterator it = macros.begin(), limit = macros.end(); + it != limit; it++) + { + Compression::wstring_write(it->first, output); + Compression::multibyte_write(it->second, output); + } + + // lists + Compression::multibyte_write(lists.size(), output); + for(map, Ltstr>::const_iterator it = lists.begin(), limit = lists.end(); + it != limit; it++) + { + Compression::wstring_write(it->first, output); + Compression::multibyte_write(it->second.size(), output); + + for(set::const_iterator it2 = it->second.begin(), limit2 = it->second.end(); + it2 != limit2; it2++) + { + Compression::wstring_write(*it2, output); + } + } + +} + +void +TransferData::writeRegexps(FILE *output) +{ + Compression::string_write(string(pcre_version()), output); + Compression::multibyte_write(attr_items.size(), output); + + map::iterator it, limit; + for(it = attr_items.begin(), limit = attr_items.end(); it != limit; it++) + { + Compression::wstring_write(it->first, output); + ApertiumRE my_re; + my_re.compile(UtfConverter::toUtf8(it->second)); + my_re.write(output); + Compression::wstring_write(it->second, output); + } +} Index: branches/apertium-tagger/apertium2/apertium/transfer_data.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_data.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_data.h (revision 69632) @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General 
Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _TRANSFERDATA_ +#define _TRANSFERDATA_ + +#include +#include +#include + +#include +#include + +using namespace std; + +class TransferData +{ +private: + void copy(TransferData const &o); + void destroy(); + + map attr_items; + map macros; + map, Ltstr> lists; + map variables; + + Alphabet alphabet; + Transducer transducer; + map finals; + + void writeRegexps(FILE *output); + public: + TransferData(); + ~TransferData(); + TransferData(TransferData const &o); + TransferData & operator =(TransferData const &o); + + Alphabet & getAlphabet(); + Transducer & getTransducer(); + map & getFinals(); + map & getAttrItems(); + + map & getMacros(); + map, Ltstr> & getLists(); + map & getVariables(); + + void write(FILE *output); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transfer_instr.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_instr.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_instr.cc (revision 69632) @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include + +using namespace Apertium; +void +TransferInstr::copy(TransferInstr const &o) +{ + type = o.type; + content = o.content; + pos = o.pos; + pointer = o.pointer; + condition = o.condition; +} + +void +TransferInstr::destroy() +{ +} + +TransferInstr::TransferInstr(TransferInstrType t, string const &c, + int const p, void *ptr, bool cond) +{ + type = t; + content = c; + pos = p; + pointer = ptr; + condition = cond; +} + +TransferInstr::~TransferInstr() +{ + destroy(); +} + +TransferInstr::TransferInstr(TransferInstr const &o) +{ + copy(o); +} + +TransferInstr & +TransferInstr::operator =(TransferInstr const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} + +TransferInstrType +TransferInstr::getType() +{ + return type; +} + +string const & +TransferInstr::getContent() +{ + return content; +} + +int +TransferInstr::getPos() +{ + return pos; +} + +void * +TransferInstr::getPointer() +{ + return pointer; +} + +bool +TransferInstr::getCondition() +{ + return condition; +} Index: branches/apertium-tagger/apertium2/apertium/transfer_token.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_token.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_token.h (revision 69632) @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + 
* License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _TRANSFERTOKEN_ +#define _TRANSFERTOKEN_ + +#include + +using namespace std; + +enum TransferTokenType +{ + tt_eof, + tt_word, + tt_blank +}; + + +class TransferToken +{ +private: + TransferTokenType type; + wstring content; + + void copy(TransferToken const &o); + void destroy(); +public: + TransferToken(); + TransferToken(wstring const &content, TransferTokenType type); + ~TransferToken(); + TransferToken(TransferToken const &o); + TransferToken & operator =(TransferToken const &o); + TransferTokenType getType(); + wstring & getContent(); + void setType(TransferTokenType type); + void setContent(wstring const &content); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transfer_word.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_word.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_word.h (revision 69632) @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _TRANSFERWORD_ +#define _TRANSFERWORD_ + +#include +#include +#include +#include + +using namespace std; + +/** + * Word type for transfer modules + */ +class TransferWord +{ +private: + /** + * Source language word + */ + string s_str; + + /** + * Target language word + */ + string t_str; + + /** + * Queue length + */ + int queue_length; + + /** + * Copy method + * @param o the object to be copied + */ + void copy(TransferWord const &o); + + /** + * Destroy method + */ + void destroy(); + + /** + * Accesses the source/target side of a word using the specified part + * @param str typically s_str or t_str + * @param part regular expression to match/access + * @return reference to matched/accessed string + */ + string access(string const &str, ApertiumRE const &part); + + /** + * Assigns a value to the source/target side of a word using the + * specified part + * @param str typically s_str or t_str + * @param part regular expression to match/access + * @param value the string to be assigned + */ + void assign(string &str, ApertiumRE const &part, string const &value); + +public: + /** + * Non-parametric constructor + */ + TransferWord(); + /** + * Destructor + */ + ~TransferWord(); + + /** + * Copy constructor + * @param o the object to be copied + */ + TransferWord(TransferWord const &o); + + /** + * Parametric constructor calling init() + * @param src source word + * @param tgt target word + * @param queue queue length + */ + TransferWord(string const &src, string const &tgt, int queue = 0); + + /** + * Assignment operator + * @param o the object to be assigned + * @return reference to left part of assignment + */ + TransferWord & operator =(TransferWord const &o); + + /** + * Sets a bi-word (a source language word and its counterpart in target + * language) + * @param src source word + * @param tgt target word + */ + void init(string 
const &src, string const &tgt); + + /** + * Reference a source language word part + * @param part regular expression to match + * @param with_queue access taking into account the queue + * @returns reference to the part of string matched + */ + string source(ApertiumRE const &part, bool with_queue = true); + + /** + * Reference a target language word part + * @param part regular expression to match + * @param with_queue access taking into account the queue + * @returns reference to the part of string matched + */ + string target(ApertiumRE const &part, bool with_queue = true); + + /** + * Sets a value for a source language word part + * @param part regular expression to match + * @param value the new value for the given part + * @param with_queue access taking or not into account the queue + */ + void setSource(ApertiumRE const &part, string const &value, + bool with_queue = true); + + /** + * Sets a value for a target language word part + * @param part regular expression to match + * @param value the new value for the given part + * @param with_queue access taking or not into account the queue + */ + void setTarget(ApertiumRE const &part, string const &value, + bool with_queue = true); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transfer_word_list.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_word_list.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_word_list.cc (revision 69632) @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include + +using namespace Apertium; +void +TransferWordList::copy(TransferWordList const &o) +{ + casefull_set = o.casefull_set; + caseless_set = o.caseless_set; +} + +void +TransferWordList::destroy() +{ +} + +TransferWordList::TransferWordList() +{ +} + +TransferWordList::~TransferWordList() +{ + destroy(); +} + +TransferWordList::TransferWordList(TransferWordList const &o) +{ + copy(o); +} + +TransferWordList & +TransferWordList::operator =(TransferWordList const &o) +{ + if(this != &o) + { + destroy(); + copy(o); + } + return *this; +} + +bool +TransferWordList::search(string const &cad, bool caseless) +{ + if(caseless) + { + return caseless_set.find(cad) != caseless_set.end(); + } + else + { + return casefull_set.find(cad) != casefull_set.end(); + } +} + +void +TransferWordList::addWord(string const &cad) +{ + casefull_set.insert(cad); + caseless_set.insert(cad); +} Index: branches/apertium-tagger/apertium2/apertium/transfer_word_list.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/transfer_word_list.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transfer_word_list.h (revision 69632) @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _TRANSFERWORDLIST_ +#define _TRANSFERWORDLIST_ + +#include +#include +#include +#ifdef _MSC_VER +#define strcasecmp _stricmp +#endif + +using namespace std; + +struct ltstr +{ + bool operator()(string const &s1, string const &s2) const + { + return s1 < s2; + } +}; + +struct ltstri +{ + bool operator()(string const &s1, string const &s2) const + { + return strcasecmp(s1.c_str(), s2.c_str()) < 0; + } +}; + +class TransferWordList +{ +private: + set casefull_set; + set caseless_set; + + void copy(TransferWordList const &o); + void destroy(); +public: + TransferWordList(); + ~TransferWordList(); + TransferWordList(TransferWordList const &o); + TransferWordList & operator =(TransferWordList const &o); + + bool search(string const &cad, bool caseless = false); + void addWord(string const &cad); +}; + +#endif Index: branches/apertium-tagger/apertium2/apertium/transferpp.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/transferpp.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transferpp.cc (revision 69632) @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include +#include +#include +#include +#include +#include + +using namespace Apertium; +using namespace std; + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + + if(argc != 3) + { + cerr << "USAGE: " << basename(argv[0]) << " rules_file transfer_file" << endl; + exit(EXIT_FAILURE); + } + + TRXReader myReader; + myReader.read(argv[1]); + myReader.write(argv[2]); + + return EXIT_SUCCESS; +} Index: branches/apertium-tagger/apertium2/apertium/ttag.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/ttag.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/ttag.h (revision 69632) @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _TTAG_ +#define _TTAG_ + +#include + +using namespace std; + +typedef int TTag; + +struct TForbidRule +{ + TTag tagi; + TTag tagj; +}; + +class TEnforceAfterRule +{ +public: + TTag tagi; + vector tagsj; +}; + + + + +#endif Index: branches/apertium-tagger/apertium2/apertium/utf_converter.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/utf_converter.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/utf_converter.cc (revision 69632) @@ -0,0 +1,613 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include +#include +#include +#include + +using namespace Apertium; + +#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD +#define UNI_MAX_BMP (UTF32)0x0000FFFF +#define UNI_MAX_UTF16 (UTF32)0x0010FFFF +#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF +#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF + +using namespace std; + +namespace UtfConverter +{ + + typedef unsigned int UTF32; /* at least 32 bits */ + typedef unsigned short UTF16; /* at least 16 bits */ + typedef unsigned char UTF8; /* typically 8 bits */ + + /* Some fundamental constants */ + + typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ + } ConversionResult; + + typedef enum { + strictConversion = 0, + lenientConversion + } ConversionFlags; + + static const int halfShift = 10; /* used for shifting by 10 bits */ + + static const UTF32 halfBase = 0x0010000UL; + static const UTF32 halfMask = 0x3FFUL; + + + void conversionError() + { + wcerr << L"Error: conversion error" << endl; + exit(EXIT_FAILURE); + } + + /* --------------------------------------------------------------------- */ + + ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= 
UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; + } + + /* --------------------------------------------------------------------- */ + + ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. 
*/ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; + + return result; + } + + /* --------------------------------------------------------------------- */ + + /* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. 
+ */ + static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + }; + + /* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ + static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + + /* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequences + * for *legal* UTF-8 will be 4 or fewer bytes total. + */ + static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + + /* --------------------------------------------------------------------- */ + + /* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function. 
+ */ + + /* --------------------------------------------------------------------- */ + + ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. 
*/ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; + } + + /* --------------------------------------------------------------------- */ + + /* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. 
+ */ + + static bool isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; + } + + /* --------------------------------------------------------------------- */ + + /* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. + */ + bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); + } + + /* --------------------------------------------------------------------- */ + + ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. 
See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; + } + + /* --------------------------------------------------------------------- */ + + ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. 
*/ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; + } + + /* --------------------------------------------------------------------- */ + + ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. 
+ */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; + } + + wstring fromUtf8(string const & utf8string) + { + size_t widesize = utf8string.length(); + if (sizeof(wchar_t) == 2) + { + wstring resultstring; + resultstring.resize(widesize+1, L'\0'); + const UTF8* sourcestart = reinterpret_cast(utf8string.c_str()); + const UTF8* sourceend = sourcestart + widesize; + UTF16* targetstart = reinterpret_cast(&resultstring[0]); + UTF16* targetend = targetstart + widesize; + ConversionResult res = ConvertUTF8toUTF16(&sourcestart, sourceend, &targetstart, targetend, strictConversion); + if (res != conversionOK) + { + conversionError(); + } + *targetstart = 0; + return resultstring.substr(0, wcslen(resultstring.c_str())); + } + else if (sizeof(wchar_t) == 4) + { + wstring resultstring; + resultstring.resize(widesize+1, L'\0'); + const UTF8* sourcestart = reinterpret_cast(utf8string.c_str()); + const UTF8* sourceend = sourcestart + widesize; + UTF32* targetstart = reinterpret_cast(&resultstring[0]); + UTF32* targetend = targetstart + widesize; + ConversionResult res = ConvertUTF8toUTF32(&sourcestart, sourceend, &targetstart, targetend, strictConversion); + if (res != conversionOK) + { + conversionError(); + } + *targetstart = 0; + return resultstring.substr(0,wcslen(resultstring.c_str())); + } + else + { + conversionError(); + } + return L""; + } + + string toUtf8(wstring const &widestring) + { + size_t widesize = widestring.length(); + + if (sizeof(wchar_t) == 2) + { + size_t utf8size = 3 * widesize + 1; + string resultstring; + resultstring.resize(utf8size, 
'\0'); + const UTF16* sourcestart = reinterpret_cast(widestring.c_str()); + const UTF16* sourceend = sourcestart + widesize; + UTF8* targetstart = reinterpret_cast(&resultstring[0]); + UTF8* targetend = targetstart + utf8size; + ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); + if (res != conversionOK) + { + conversionError(); + } + *targetstart = 0; + return resultstring.substr(0, strlen(resultstring.c_str())); + } + else if (sizeof(wchar_t) == 4) + { + size_t utf8size = 4 * widesize + 1; + string resultstring; + resultstring.resize(utf8size, '\0'); + const UTF32* sourcestart = reinterpret_cast(widestring.c_str()); + const UTF32* sourceend = reinterpret_cast(widestring.c_str() + widesize); + UTF8* targetstart = reinterpret_cast(&resultstring[0]); + UTF8* targetend = targetstart + utf8size; + ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); + if (res != conversionOK) + { + conversionError(); + } + *targetstart = 0; + return resultstring.substr(0, strlen(resultstring.c_str())); + } + else + { + conversionError(); + } + return ""; + } +} Index: branches/apertium-tagger/apertium2/apertium/utf_converter.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/utf_converter.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/utf_converter.h (revision 69632) @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _UTFCONVERTER_ +#define _UTFCONVERTER_ + +#include + +using namespace std; + +namespace UtfConverter +{ + wstring fromUtf8(string const &utf8string); + string toUtf8(wstring const &widestring); +} + +#endif Index: branches/apertium-tagger/apertium2/apertium/apertium-unformat.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-unformat.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-unformat.1 (revision 69632) @@ -0,0 +1,45 @@ +.TH apertium 1 2006-03-08 "" "" +.SH NAME +apertium-unformat \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.sf.net\fR. +.SH SYNOPSIS +.B apertium-unformat +[\-f format] [infile [outfile]] +.SH DESCRIPTION +.BR apertium +is the application that extracts unformatted text from documents. +.RE +.SH OPTIONS +.PP +.B -f format +Specifies the format of the input and output files which can have +these values: +.RS +\(bu \fItxt\fR \fB(default value)\fR Input and output files are in +text format. +.PP +\(bu \fIhtml\fR Input and output files are in "html" format. This +"html" is the one accepted by the vast majority of web browsers. +.PP +\(bu \fIrtf\fR Input and output files are in "rtf" format. The +accepted "rtf" is the one generated by \fBMicrosoft WordPad (C)\fR and +\fBMicrosoft Office (C)\fR up to and including \fBOffice-97\fR. +.RE +.PP +.B infile +Input file (stdin by default). +.PP +.B outfile +Output file (stdout by default).
+.PP +.SH SEE ALSO +.I apertium\fR(1), +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. Index: branches/apertium-tagger/apertium2/apertium/apertium-deshtml.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-deshtml.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-deshtml.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-deshtml 1 2006-03-21 "" "" +.SH NAME +apertium-deshtml \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-deshtml +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-deshtml +is an HTML format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of an HTML document and produces output suitable for +processing with lt-proc. HTML tags and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-deshtml | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-deslatex.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-deslatex.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-deslatex.1 (revision 69632) @@ -0,0 +1,50 @@ +.TH apertium-deslatex 1 2012-02-29 "" "" +.SH NAME +apertium-deslatex \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-deslatex +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-deslatex +This filter preprocesses apertium-prelatex output to a deformatted 'XMLish' +LaTeX custom format. The output is suitable for +processing with lt-proc. Format information (newlines, tabs, etc.) is enclosed in brackets so that lt-proc treats it as whitespace between words. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-deslatex | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-prelatex\fR(1), +.I apertium-postlatex\fR(1), +.I apertium-relatex\fR(1), +.I apertium-postlatex-raw\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated constructions in LaTeX (i.e. custom defined tags) are not (yet) +supported. +.PP +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License .
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-desodt.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desodt.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desodt.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-desodt 1 2006-03-21 "" "" +.SH NAME +apertium-desodt \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-desodt +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desodt +is an ODT format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of an ODT document and produces output suitable for +processing with lt-proc. ODT tags and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-desodt | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-despptx.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-despptx.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-despptx.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-despptx 1 2006-03-21 "" "" +.SH NAME +apertium-despptx \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-despptx +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-despptx +is a PPTX format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of a PPTX document and produces output suitable for +processing with lt-proc. PPTX tags and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-despptx | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License .
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-desrtf.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desrtf.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desrtf.1 (revision 69632) @@ -0,0 +1,47 @@ +.TH apertium-desrtf 1 2006-03-21 "" "" +.SH NAME +apertium-desrtf \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium-desrtf +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desrtf +is an RTF format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of an RTF document and produces output suitable for +processing with lt-proc. RTF commands and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the input document is analysed: +.TP +cat | apertium-desrtf | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-deshtml\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-destxt.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-destxt.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-destxt.1 (revision 69632) @@ -0,0 +1,46 @@ +.TH apertium-destxt 1 2006-03-21 "" "" +.SH NAME +apertium-destxt \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-destxt +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-destxt +is a text format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of a text file and produces output suitable for +processing with lt-proc. Format information (newlines, tabs, etc.) is enclosed in brackets so that lt-proc treats it as whitespace between words. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-destxt | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-deshtml\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License .
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-deswxml.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-deswxml.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-deswxml.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-deswxml 1 2006-03-21 "" "" +.SH NAME +apertium-deswxml \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-deswxml +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-deswxml +is a WXML format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of a WXML document and produces output suitable for +processing with lt-proc. WXML tags and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-deswxml | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License .
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-desxlsx.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desxlsx.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desxlsx.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-desxlsx 1 2006-03-21 "" "" +.SH NAME +apertium-desxlsx \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-desxlsx +[ \-h ] [ \-i ] [ \-n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desxlsx +is an XLSX format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of an XLSX document and produces output suitable for +processing with lt-proc. XLSX tags and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the word "gener" is analysed: +.TP +echo "gener" | apertium-desxlsx | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-desrtf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-pretransfer.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-pretransfer.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-pretransfer.1 (revision 69632) @@ -0,0 +1,40 @@ +.TH apertium-pretransfer 1 2006-03-21 "" "" +.SH NAME +apertium-pretransfer \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-pretransfer +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-pretransfer +module applies some changes to multiwords (such as moving the lemma queue of +a multiword with inner inflection just after the lemma head). If +the input is not a multiword, it does not affect the output. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the expression "trobant-lo a faltar" is analysed: +.TP +echo "trobant-lo a faltar" | apertium-destxt | lt-proc ca-es.automorf.bin |./ca-es.tagger \-\-tagger ca-es | apertium-pretransfer +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-transfer\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-tagger-apply-new-rules.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-tagger-apply-new-rules.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-tagger-apply-new-rules.1 (revision 69632) @@ -0,0 +1,40 @@ +.TH apertium-tagger-apply-new-rules 1 2007-03-24 "" "" +.SH NAME +apertium-tagger-apply-new-rules \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-tagger-apply-new-rules +\-\-filein [ ] \-\-fileout [ ] \-\-tsxfile [ ] + +.PP +.SH DESCRIPTION +.BR apertium-tagger-apply-new-rules +is used to forbid and enforce rules which are applied to the given HMM parameters. + +Note that the TSX file provided with \-\-tsxfile *must* be equal, in terms of label definitions, to the one used when training the HMM parameters that are to be modified. + +.SH OPTIONS +.TP +.B \-i, \-\-filein +Specify the file with the HMM parameter to process +.TP +.B \-o, \-\-fileout +To specify the file to which the HMM will be written +.TP +.B \-x, \-\-tsxfile +File containing the rules to apply +.PP +.SH SEE ALSO +.I apertium-tagger\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005 -- 2007, Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-tagger.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-tagger.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-tagger.1 (revision 69632) @@ -0,0 +1,103 @@ +.TH apertium-tagger 1 2006-08-30 "" "" +.SH NAME +apertium-tagger \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +architecture: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-tagger \-\-train|\-t +{n} DIC CRP TSX PROB [\-\-debug|\-d]\fR +.PP +.B apertium-tagger \-\-supervised|\-s +{n} DIC CRP TSX PROB HTAG UNTAG [\-\-debug|\-d]\fR +.PP +.B apertium-tagger \-\-retrain|\-r +{n} CRP PROB [\-\-debug|\-d] \fR +.PP +.B apertium-tagger \-\-tagger|\-g +[\-\-first|\-f] PROB [\-\-debug|\-d] [INPUT [OUTPUT]] \fR +.PP +.SH DESCRIPTION +.BR apertium-tagger +is the application responsible for the apertium part-of-speech tagger +training or tagging, depending on the calling options. This command +only reads from the standard input if the option \fB\-\-tagger\fR or +\fB\-g\fR is used. +.SH OPTIONS +.TP +.B \-t {n}, \-\-train {n} +Initializes parameters through Kupiec's method (unsupervised), +then performs \fBn\fR iterations of the Baum-Welch training algorithm +(unsupervised). +.TP +.B \-s {n}, \-\-supervised {n} +Initializes parameters against a hand-tagged text (supervised) through +the maximum likelihood estimate method, then performs \fBn\fR +iterations of the Baum-Welch training algorithm (unsupervised) +.TP +.B \-r {n}, \-\-retrain {n} +Retrains the model with \fBn\fR additional Baum-Welch iterations +(unsupervised). +.TP +.B \-g, \-\-tagger +Tags input text by means of Viterbi algorithm. +.TP +.B \-p, \-\-show\-superficial +Prints the superficial form of the word along side the lexical form +in the output stream. 
+.TP +.B \-f, \-\-first +Used in conjunction with \-g (\-\-tagger) makes the tagger +give all lexical forms of each word, with the chosen +one in the first place (after the lemma) +.TP +.B \-d, \-\-debug +Print error (if any) or debug messages while operating. +.TP +.B \-m, \-\-mark +Mark disambiguated words. +.TP +.B \-h, \-\-help +Display a help message. +.SH FILES +These are the kinds of files used with each option: +.PP +.B DIC +Full expanded dictionary file +.PP +.B CRP +Training text corpus file +.PP +.B TSX +Tagger specification file, in XML format +.PP +.B PROB +Tagger data file, built in the training and used while tagging +.PP +.B HTAG +Hand-tagged text corpus +.PP +.B UNTAG +Untagged text corpus, morphological analysis of HTAG corpus to use +both jointly with \-s option +.PP +.B INPUT +Input file, stdin by default +.PP +.B OUTPUT +Output file, stdout by default +.PP +.SH SEE ALSO +.I lt-proc\fR(1), +.I lt-comp\fR(1), +.I lt-expand\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.cc (revision 69632) @@ -0,0 +1,744 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. 
* +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include + +namespace TMXAligner +{ + +extern std::string hunglishDictionaryHome; +extern std::string hunglishExperimentsHome; + +void readTrailOrBisentenceList( std::istream& is, Trail& trail ) +{ + trail.clear(); + while ( is.peek() != -1 ) + { + int huPos, enPos; + + is >> huPos; + if (is.peek()!=' ') + { + std::cerr << "no space in line" << std::endl; + throw "data error"; + } + is.ignore(); + + is >> enPos; + if (is.peek()!='\n') + { + std::cerr << "too much data in line" << std::endl; + throw "data error"; + } + is.ignore(); + + trail.push_back(std::make_pair(huPos,enPos)); + } +} + +void scoreBisentenceListByFile( const BisentenceList& bisentenceList, const std::string& handAlignFile ) +{ + Trail trailHand; + std::ifstream is( handAlignFile.c_str() ); + readTrailOrBisentenceList( is, trailHand ); + + scoreBisentenceList( bisentenceList, trailHand ); +} + +void scoreTrailByFile( const Trail& bestTrail, const std::string& handAlignFile ) +{ + Trail trailHand; + std::ifstream is( handAlignFile.c_str() ); + readTrailOrBisentenceList( is, trailHand ); + + scoreTrail( bestTrail, trailHand ); +} + +// TEMP TEMP +void logLexiconCoverageOfBicorpus( SentenceList& huSentenceList, SentenceList& enSentenceList, + const DictionaryItems& dictionaryItems ); + + +// The

scores should not be counted. This causes some complications. +// Otherwise, this is just the average score of segments. +// Currently this does not like segment lengths of more than two. +double globalScoreOfTrail( const Trail& trail, const AlignMatrix& dynMatrix, + const SentenceList& huSentenceListGarbled, const SentenceList& enSentenceListGarbled ) +{ + TrailScoresInterval trailScoresInterval( trail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); + + return trailScoresInterval(0,trail.size()-1); +} + + +void collectBisentences( const Trail& bestTrail, const AlignMatrix& dynMatrix, + const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty, + SentenceList& huBisentences, SentenceList& enBisentences, + double qualityThreshold ) +{ + huBisentences.clear(); + enBisentences.clear(); + + BisentenceList bisentenceList; + + TrailScores trailScores( bestTrail, dynMatrix ); + trailToBisentenceList( bestTrail, trailScores, qualityThreshold, bisentenceList ); + + for (size_t i=0; ienBookSize ? huBookSize : enBookSize ) / thicknessRatio ) ; + + thickness = ( thickness>minimalThickness ? thickness : minimalThickness ) ; + + if (thickness>maximalThickness) + { +// std::cerr << "WARNING: Downgrading planned thickness " << thickness << " to " << maximalThickness ; +// std::cerr << " to obey memory constraint of " << quasiglobal_maximalSizeInMegabytes << " megabytes " << std::endl; +// std::cerr << "You should recompile if you have much more physical RAM than that. People of the near-future, forgive me for the inconvenience." << std::endl; + + thickness = maximalThickness; + } + + AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); + + sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix ); +// std::cerr << std::endl; +// std::cerr << "Rough translation-based similarity matrix ready." 
<< std::endl; + + Trail bestTrail; + AlignMatrix dynMatrix( huBookSize+1, enBookSize+1, thickness, 1e30 ); + + align( similarityMatrix, huLength, enLength, bestTrail, dynMatrix ); +// std::cerr << "Align ready." << std::endl; + + double globalQuality; + globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, + huSentenceListGarbled, enSentenceListGarbled ); + + // std::cerr << "Global quality of unfiltered align " << globalQuality << std::endl; + + if (alignParameters.realignType==AlignParameters::NoRealign) + { + } + else + { + AlignMatrix similarityMatrixDetailed( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); + + bool success = borderDetailedAlignMatrix( similarityMatrixDetailed, bestTrail, 5/*radius*/ ); + + if (!success) + { +// std::cerr << "Realign zone too close to quasidiagonal border. Abandoning realign. The align itself is suspicious." << std::endl; + } + else + { +// std::cerr << "Border of realign zone determined." << std::endl; + + switch (alignParameters.realignType) + { + case AlignParameters::ModelOneRealign: + { + IBMModelOne modelOne; + + SentenceList huBisentences,enBisentences; + + throw "unimplemented"; +// std::cerr << "Plausible bisentences filtered." << std::endl; + + modelOne.build(huBisentences,enBisentences); +// std::cerr << "IBM Model I ready." << std::endl; + + sentenceListsToAlignMatrixIBMModelOne( huSentenceListPretty, enSentenceList, modelOne, similarityMatrixDetailed ); +// std::cerr << "IBM Model I based similarity matrix ready." << std::endl; + break; + } + case AlignParameters::FineTranslationRealign: + { + TransLex transLex; + transLex.build(dictionary); +// std::cerr << "Hashtable for dictionary ready." << std::endl; + + sentenceListsToAlignMatrixTranslation( huSentenceListPretty, enSentenceList, transLex, similarityMatrixDetailed ); + +// std::cerr << "Fine translation-based similarity matrix ready." 
<< std::endl; + break; + } + + case AlignParameters::NoRealign: + default: + { + break; + } + } + + Trail bestTrailDetailed; + AlignMatrix dynMatrixDetailed( huBookSize+1, enBookSize+1, thickness, 1e30 ); + align( similarityMatrixDetailed, huLength, enLength, bestTrailDetailed, dynMatrixDetailed ); +// std::cerr << "Detail realign ready." << std::endl; + + bestTrail = bestTrailDetailed; + dynMatrix = dynMatrixDetailed; + + globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, + huSentenceListGarbled, enSentenceListGarbled ); + + // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; + } + } + + TrailScoresInterval trailScoresInterval( bestTrail, dynMatrix, huSentenceListGarbled, enSentenceListGarbled ); + + if ( alignParameters.postprocessTrailQualityThreshold != -1 ) + { + postprocessTrail( bestTrail, trailScoresInterval, alignParameters.postprocessTrailQualityThreshold ); +// std::cerr << "Trail start and end postprocessed by score." << std::endl; + } + + if ( alignParameters.postprocessTrailStartAndEndQualityThreshold != -1 ) + { + postprocessTrailStartAndEnd( bestTrail, trailScoresInterval, alignParameters.postprocessTrailStartAndEndQualityThreshold ); +// std::cerr << "Trail start and end postprocessed by score." << std::endl; + } + + if ( alignParameters.postprocessTrailByTopologyQualityThreshold != -1 ) + { + postprocessTrailByTopology( bestTrail, alignParameters.postprocessTrailByTopologyQualityThreshold ); +// std::cerr << "Trail postprocessed by topology." << std::endl; + } + + bool quasiglobal_spaceOutBySentenceLength = true; +// std::cerr << "quasiglobal_spaceOutBySentenceLength is set to " << quasiglobal_spaceOutBySentenceLength << std::endl; + if (quasiglobal_spaceOutBySentenceLength) + { + spaceOutBySentenceLength( bestTrail, huSentenceListPretty, enSentenceList, alignParameters.utfCharCountingMode ); +// std::cerr << "Trail spaced out by sentence length." 
<< std::endl; + } + + // In cautious mode, auto-aligned rundles are thrown away if + // their left or right neighbour holes are not one-to-one. + if (alignParameters.cautiousMode) + { + cautiouslyFilterTrail( bestTrail ); +// std::cerr << "Trail filtered by topology." << std::endl; + } + + globalQuality = globalScoreOfTrail( bestTrail, dynMatrix, + huSentenceListGarbled, enSentenceListGarbled ); + + // std::cerr << "Global quality of unfiltered align after realign " << globalQuality << std::endl; + + bool textual = ! alignParameters.justSentenceIds ; + + if (alignParameters.justBisentences) + { + BisentenceList bisentenceList; + trailToBisentenceList( bestTrail, bisentenceList ); + + filterBisentenceListByQuality( bisentenceList, dynMatrix, alignParameters.qualityThreshold ); + + BisentenceListScores bisentenceListScores(bisentenceList, dynMatrix); + + for ( size_t i=0; i remains; + args.read( argC, argV, remains ); + + AlignParameters alignParameters; + + if (args.getSwitchCompact("text")) + { + alignParameters.justSentenceIds = false; + } + + if (args.getSwitchCompact("bisent")) + { + alignParameters.justBisentences = true; + } + + if (args.getSwitchCompact("cautious")) + { + alignParameters.cautiousMode = true; + } + + alignParameters.utfCharCountingMode = args.getSwitchCompact("utf"); + + fillPercentParameter( args, "thresh", alignParameters.qualityThreshold ); + + fillPercentParameter( args, "ppthresh", alignParameters.postprocessTrailQualityThreshold ); + + fillPercentParameter( args, "headerthresh", alignParameters.postprocessTrailStartAndEndQualityThreshold ); + + fillPercentParameter( args, "topothresh", alignParameters.postprocessTrailByTopologyQualityThreshold ); + + bool batchMode = args.getSwitchCompact("batch") ; + + if (batchMode && (remains.size()!=2) ) + { + std::cerr << "Batch mode requires exactly two file arguments." 
<< std::endl; + std::cerr << std::endl; + + main_alignerToolUsage(); + throw "argument error"; + } + + std::string handArgumentname = "hand"; + if (args.find(handArgumentname)!=args.end()) + { + if (batchMode) + { + std::cerr << "-batch and -" << handArgumentname << " are incompatible switches." << std::endl; + throw "argument error"; + } + else + { + alignParameters.handAlignFilename = args[handArgumentname].dString ; + args.erase(handArgumentname); + + if (alignParameters.handAlignFilename.empty()) + { + std::cerr << "-" << handArgumentname << " switch requires a filename value." << std::endl; + throw "argument error"; + } + } + } + + std::string autoDictDumpArgumentname = "autodict"; + if (args.find(autoDictDumpArgumentname)!=args.end()) + { + if (batchMode) + { + std::cerr << "-batch and -" << autoDictDumpArgumentname << " are incompatible switches." << std::endl; + throw "argument error"; + } + else + { + alignParameters.autoDictionaryDumpFilename = args[autoDictDumpArgumentname].dString ; + args.erase(autoDictDumpArgumentname); + + if (alignParameters.autoDictionaryDumpFilename.empty()) + { + std::cerr << "-" << autoDictDumpArgumentname << " switch requires a filename value." << std::endl; + throw "argument error"; + } + } + } + + if (!batchMode && (remains.size()!=3) ) + { + std::cerr << "Nonbatch mode requires exactly three file arguments." << std::endl; + std::cerr << std::endl; + + main_alignerToolUsage(); + throw "argument error"; + } + + try + { + args.checkEmptyArgs(); + } + catch (...) + { + std::cerr << std::endl; + + main_alignerToolUsage(); + throw "argument error"; + } + +// std::cerr << "Reading dictionary..." 
<< std::endl; + const char* dicFilename = remains[0] ; + DictionaryItems dictionary; + std::ifstream dis(dicFilename); + dictionary.read(dis); + + if (batchMode) + { + const char* batchFilename = remains[1] ; + std::ifstream bis(batchFilename); + + while (bis.good()&&!bis.eof()) + { + std::string line; + std::getline(bis,line); + + std::vector words; + split( line, words, '\t' ); + + if (words.size()!=3) + { + std::cerr << "Batch file has incorrect format." << std::endl; + throw "data error"; + } + + std::string huFilename, enFilename, outFilename; + huFilename = words[0]; + enFilename = words[1]; + outFilename = words[2]; + +// std::cerr << "Processing " << outFilename << std::endl; + bool failed = false; + try + { + alignerToolWithFilenames( dictionary, huFilename, enFilename, alignParameters, outFilename ); + } + catch ( const char* errorType ) + { + std::cerr << errorType << std::endl; + failed = true; + } + catch ( std::exception& e ) + { + std::cerr << "some failed assertion:" << e.what() << std::endl; + failed = true; + } + catch ( ... ) + { + std::cerr << "some unknown failed assertion..." << std::endl; + failed = true; + } + + if (failed) + { + std::cerr << "Align failed for " << outFilename << std::endl; + } + } + } + else + { + const char* huFilename = remains[1] ; + const char* enFilename = remains[2] ; + + alignerToolWithFilenames( dictionary, huFilename, enFilename, alignParameters ); + } + } +#ifndef _DEBUG + catch ( const char* errorType ) + { + std::cerr << errorType << std::endl; + return -1; + } + catch ( std::exception& e ) + { + std::cerr << "some failed assertion:" << e.what() << std::endl; + return -1; + } + catch ( ... ) + { + std::cerr << "some unknown failed assertion..." 
<< std::endl; + return -1; + } +#endif + return 0; +} + +} Index: branches/apertium-tagger/apertium2/apertium/apertium-desrtf-cp1250.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desrtf-cp1250.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desrtf-cp1250.1 (revision 69632) @@ -0,0 +1,47 @@ +.TH apertium-desrtf 1 2006-03-21 "" "" +.SH NAME +apertium-desrtf \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://apertium.sf.net\fR. +.SH SYNOPSIS +.B apertium-desrtf +[ -h ] [ -i ] [ -n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desrtf +is an RTF format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of an RTF document and produces output suitable for +processing with lt-proc. RTF commands and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the input document is analysed: +.TP +cat | apertium-desrtf | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-deshtml\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-desrtf-cp1251.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-desrtf-cp1251.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-desrtf-cp1251.1 (revision 69632) @@ -0,0 +1,47 @@ +.TH apertium-desrtf 1 2006-03-21 "" "" +.SH NAME +apertium-desrtf \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://apertium.sf.net\fR. +.SH SYNOPSIS +.B apertium-desrtf +[ -h ] [ -i ] [ -n ] +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-desrtf +is an RTF format processor. Data should be passed through this +processor before being piped to lt-proc. The program takes input +in the form of an RTF document and produces output suitable for +processing with lt-proc. RTF commands and other format information are enclosed in brackets so that lt-proc treats them as whitespace between words. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.B \-i +Makes the addition of trailing sentence terminator (".") unconditional, often +leading to duplicates. +.B \-n +Suppresses the addition of a trailing sentence terminator. +.PP +.SH EXAMPLE +.TP +You could write the following to show how the input document is analysed: +.TP +cat | apertium-desrtf | lt-proc ca-es.automorf.bin +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-deshtml\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-postlatex-raw.l =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-postlatex-raw.l (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-postlatex-raw.l (revision 69632) @@ -0,0 +1,306 @@ + + +%{ + + + +#include +#include +#include +#include +#include +#include + +extern "C" { +#if !defined(__STDC__) +# define __STDC__ 1 +#endif +#include +} + +#include +#include +#include +#ifndef GENFORMAT +#include "apertium_config.h" +#endif +#include +#ifdef _WIN32 +#include +#include +#endif + +using namespace std; + +AccentsMap accentsMap(true); +wstring closesym = L""; +string memconv = ""; + +wstring convertir(string const &multibyte, int const length) +{ + memconv.append(multibyte.c_str(), length); + int tam = memconv.size(); + wchar_t *retval = new wchar_t[tam+1]; + size_t l = mbstowcs(retval, memconv.c_str(), tam); + + if(l == ((size_t) -1)) + { + delete[] retval; + if(memconv.size() >= 4) + { + wcerr << L"Warning: wrong encoding" << endl; + } + return L""; + } + else + { + memconv = ""; + retval[l] = 0; + wstring ret = retval; + delete[] retval; + return ret; + } +} + + + + +%} + + +%option nounput +%option noyywrap +%option stack + +%x mathenv +%x readbrackets + +%% + + + +" { + fputws(L"\"",yyout); +} +' { + fputws(L"\'",yyout); +} +< { + fputws(L"<",yyout); +} +> { + fputws(L">",yyout); +} +& { + fputws(L"\\&",yyout); +} +\ { + fputws(L"&",yyout); +} + +\ { + fputws(L"\\{", yyout); +} + +\ { + fputws(L"\\}", yyout); +} + +\ { + fputws(L"\\%", yyout); +} + + + +Âż { + fputws(L"?`",yyout); +} + +ÂĄ { + fputws(L"!`",yyout); +} + + + +\ { + BEGIN(mathenv); + fputws(L"$$",yyout); +} + +\<\/MATH_DOLLARS\> { + fputws(L"$$",yyout); + BEGIN(0); +} + + +\ { + BEGIN(mathenv); + fputws(L"$",yyout); +} + +\<\/MATH_DOLLAR\> { + fputws(L"$",yyout); + BEGIN(0); +} + +\ { + fputws(L"\\(",yyout); +} + +\<\/MATH_PAR\> { + 
fputws(L"\\)",yyout); +} + +\ { + fputws(L"\\[",yyout); +} + +\<\/MATH_BRA\> { + fputws(L"\\]",yyout); +} + + +\ { + fputws(L"{",yyout); +} + +\<\/CONTENTS\> { + fputws(L"}",yyout); +} + +&NBSP; { + fputws(L"~",yyout); +} + + + +\ { + fputws(L"\\\\",yyout); +} + +\[^\<]* { + fputws((wstring(L"\%")+convertir(yytext+9,yyleng-9)).c_str(),yyout); +} + +\<\/COMMENT\> { +} + + +\[^\<]* { + fputws((wstring(L"[")+convertir(yytext+7,yyleng-7)).c_str(),yyout); +} +\<\/PARAM\> { + fputws(L"]", yyout); +} + +\ { + fputws(L"\\verb", yyout); +} + +\<\/VERB\> { + ; +} + + +\<[a-zA-Z0-9]+\> { + fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-2)+wstring(L"}")).c_str(),yyout); +} + +\<[a-zA-Z0-9]+_STAR\> { + fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-7)+wstring(L"*}")).c_str(),yyout); +} + +\<\/[a-zA-Z0-9]+\> { + fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-3)+wstring(L"}")).c_str(),yyout); +} + +\<\/[a-zA-Z0-9]+_STAR\> { + fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-8)+wstring(L"*}")).c_str(),yyout); +} + +\<[a-zA-Z0-9]+\/\> { + fputws((wstring(L"\\")+convertir(yytext+1,yyleng-3)).c_str(),yyout); +} + +\<[a-zA-Z0-9]+_STAR\/\> { + fputws((wstring(L"\\")+convertir(yytext+1,yyleng-8)+wstring(L"*")).c_str(),yyout); +} + +\# { + fputws(L"\\#", yyout); +} + + +(.|\n) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + +(.|\n) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + + +<> { + return 0; +} +%% + + + +void usage(string const &progname) +{ + + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + + cerr << "LaTeX format postprocessor " << endl; + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + size_t base = 0; + + if(argc >= 2 && !strcmp(argv[1],"-i")) + { + base++; + } + + if((argc-base) > 4) + { + usage(argv[0]); + } + + switch(argc-base) + { + case 3: + yyout = fopen(argv[2+base], "w"); + if(!yyout) + { + usage(argv[0]); + } + case 2: + yyin = 
fopen(argv[1+base], "r"); + if(!yyin) + { + usage(argv[0]); + } + break; + default: + break; + } + +#ifdef _WIN32 + _setmode(_fileno(yyin), _O_U8TEXT); + _setmode(_fileno(yyout), _O_U8TEXT); +#endif + // prevent warning message + yy_push_state(1); + yy_top_state(); + yy_pop_state(); + + yylex(); + + fclose(yyin); + fclose(yyout); +} Index: branches/apertium-tagger/apertium2/apertium/apertium-postlatex.l =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-postlatex.l (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-postlatex.l (revision 69632) @@ -0,0 +1,364 @@ + + +%{ + + + +#include +#include +#include +#include +#include +#include + +extern "C" { +#if !defined(__STDC__) +# define __STDC__ 1 +#endif +#include +} + +#include +#include +#include +#ifndef GENFORMAT +#include "apertium_config.h" +#endif +#include +#ifdef _WIN32 +#include +#include +#endif + +using namespace std; + +AccentsMap accentsMap(true); +wstring closesym = L""; +string memconv = ""; + +wstring convertir(string const &multibyte, int const length) +{ + memconv.append(multibyte.c_str(), length); + int tam = memconv.size(); + wchar_t *retval = new wchar_t[tam+1]; + size_t l = mbstowcs(retval, memconv.c_str(), tam); + + if(l == ((size_t) -1)) + { + delete[] retval; + if(memconv.size() >= 4) + { + wcerr << L"Warning: wrong encoding" << endl; + } + return L""; + } + else + { + memconv = ""; + retval[l] = 0; + wstring ret = retval; + delete[] retval; + return ret; + } +} + + + + +%} + + +%option nounput +%option noyywrap +%option stack + +%x mathenv +%x readbrackets + +%% + + + +" { + fputws(L"\"",yyout); +} +' { + fputws(L"\'",yyout); +} +< { + fputws(L"<",yyout); +} +> { + fputws(L">",yyout); +} +& { + fputws(L"\\&",yyout); +} +\ { + fputws(L"&",yyout); +} + +\ { + fputws(L"\\{", yyout); +} + +\ { + fputws(L"\\}", yyout); +} + +\ { + fputws(L"\\%", yyout); +} + +Âż { + fputws(L"?`",yyout); +} + +ÂĄ { + 
fputws(L"!`",yyout); +} + + + +\ { + BEGIN(mathenv); + fputws(L"$$",yyout); +} + +\<\/MATH_DOLLARS\> { + fputws(L"$$",yyout); + BEGIN(0); +} + + +\ { + BEGIN(mathenv); + fputws(L"$",yyout); +} + +\<\/MATH_DOLLAR\> { + fputws(L"$",yyout); + BEGIN(0); +} + +\ { + fputws(L"\\(",yyout); +} + +\<\/MATH_PAR\> { + fputws(L"\\)",yyout); +} + +\ { + fputws(L"\\[",yyout); +} + +\<\/MATH_BRA\> { + fputws(L"\\]",yyout); +} + + +\ { + fputws(L"{",yyout); +} + +\<\/CONTENTS\> { + fputws(L"}",yyout); +} + +&NBSP; { + fputws(L"~",yyout); +} + + + +\ { + fputws(L"\\\\",yyout); +} + +\[^\<]* { + fputws((wstring(L"\%")+convertir(yytext+9,yyleng-9)).c_str(),yyout); +} + +\<\/COMMENT\> { +} + + +\[^\<]* { + fputws((wstring(L"[")+convertir(yytext+7,yyleng-7)).c_str(),yyout); +} +\<\/PARAM\> { + fputws(L"]", yyout); +} + +\ { + fputws(L"\\verb", yyout); +} + +\<\/VERB\> { + ; +} + + + +Ƃ { + fputws(L"\\l", yyout); +} + + +Ɠ { + fputws(L"{\\oe}",yyout); +} + +ƒ { + fputws(L"{\\OE}",yyout); +} + +ĂŠ { + fputws(L"{\\ae}",yyout); +} + +Æ { + fputws(L"{\\AE}",yyout); +} + +Ă„ { + fputws(L"{\\aa}",yyout); +} + +Å { + fputws(L"{\\AA}",yyout); +} + +Ăž { + fputws(L"{\\o}",yyout); +} + +Ø { + fputws(L"{\\O}",yyout); +} + +ß { + fputws(L"{\\ss}",yyout); +} + +\<[a-zA-Z0-9]+\> { + fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-2)+wstring(L"}")).c_str(),yyout); +} + +\ { + fputws((wstring(L"\\#")+convertir(yytext+6,yyleng-8)).c_str(),yyout); +} + +\ { + fputws(L"\\#", yyout); +} + +\<[a-zA-Z0-9]+_STAR\> { + fputws((wstring(L"\\begin{")+convertir(yytext+1,yyleng-7)+wstring(L"*}")).c_str(),yyout); +} + +\<\/[a-zA-Z0-9]+\> { + fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-3)+wstring(L"}")).c_str(),yyout); +} + +\<\/[a-zA-Z0-9]+_STAR\> { + fputws((wstring(L"\\end{")+convertir(yytext+2,yyleng-8)+wstring(L"*}")).c_str(),yyout); +} + +\<[a-zA-Z0-9]+\/\> { + fputws((wstring(L"\\")+convertir(yytext+1,yyleng-3)).c_str(),yyout); +} + +\<[a-zA-Z0-9]+_STAR\/\> { + 
fputws((wstring(L"\\")+convertir(yytext+1,yyleng-8)+wstring(L"*")).c_str(),yyout); +} + /*NO ENTIENDO ESTA REGLA + \# { + fputws(L"\\#", yyout); + }*/ + + +[^A-Za-z\n] { + wstring wt = convertir(yytext,yyleng); + wstring wa = accentsMap.get(wt); + if( wa == L"" ) + fputws(wt.c_str(),yyout); + else + fputws(wstring(L"\\"+wa.substr(0,1)+L"{"+wa.substr(1)+L"}").c_str(),yyout); +} + + +(.|\n) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + +(.|\n) { + fputws(convertir(yytext,yyleng).c_str(),yyout); +} + + +<> { + return 0; +} +%% + + + +void usage(string const &progname) +{ + + cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl; + + cerr << "LaTeX format postprocessor " << endl; + exit(EXIT_SUCCESS); +} + +int main(int argc, char *argv[]) +{ + LtLocale::tryToSetLocale(); + size_t base = 0; + + if(argc >= 2 && !strcmp(argv[1],"-i")) + { + base++; + } + + if((argc-base) > 4) + { + usage(argv[0]); + } + + switch(argc-base) + { + case 3: + yyout = fopen(argv[2+base], "w"); + if(!yyout) + { + usage(argv[0]); + } + case 2: + yyin = fopen(argv[1+base], "r"); + if(!yyin) + { + usage(argv[0]); + } + break; + default: + break; + } + +#ifdef _WIN32 + _setmode(_fileno(yyin), _O_U8TEXT); + _setmode(_fileno(yyout), _O_U8TEXT); +#endif + // prevent warning message + yy_push_state(1); + yy_top_state(); + yy_pop_state(); + + yylex(); + + fclose(yyin); + fclose(yyout); +} Index: branches/apertium-tagger/apertium2/apertium/xsd/transfer.xsd =================================================================== --- branches/apertium-tagger/apertium2/apertium/xsd/transfer.xsd (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/xsd/transfer.xsd (revision 69632) @@ -0,0 +1,1049 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 'transfer' is the root element containing the whole structural + transfer rule file. 
Attribute 'default' specifies if + unmatched words have to be written as lexical units ("lu", this is + the default value) or as chunks ("chunk"). + + + + + + + + + + + + + + + + + + + + + + + + + + The 'def-cats' section defines the categories used to build the + patterns used in rules + + + + + + + + + + + + + + + + + Each 'def-cat' defines one category in terms of a list of + category items and has a unique name 'n', which is mandatory + + + + + + + + + + + + + + + Each 'cat-item' (category item) represents a set of lexical forms + and has a mandatory attribute 'tags' whose value is a sequence of + dot-separated tag names; this sequence is a subsequence of the + tag sequence defining each possible lexical form. For example, + tags="n.f" would match all lexical forms containing this tag + sequence, such as "^casa<n><f><pl>$". + + In addition, an optional attribute, "lemma", may be used to + define lexical forms having a particular substring in their lemma + + + + + + + + + + + + + The 'def-attrs' section defines the attributes that will be + identified in matched lexical forms + + + + + + + + + + + + + + + + + Each def-attr defines one attribute in terms of a list of + attribute items and has a mandatory unique name n + + + + + + + + + + + + + + + Each 'attr-item' specifies a subsequence of the tags in + that lexical form (attribute 'tags') + + + + + + + + + + + + The 'def-vars' section defines the global variables + that will be used to transfer information between rules + + + + + + + + + + + + + + + + + The definition of a global variable has a mandatory unique name 'n' that + will be used to refer to it. A value of initialization can also be specified + by means the 'v' attribute. The default value of the initialization is the + empty string. + + + + + + + + + + + + + Element 'section-def-lists' encloses a set of list definitions + + + + + + + + + + + + + + + + + The 'def-list' element defines a named list to search with the 'in' + element. 
Attribute 'n' sets the name of the list + + + + + + + + + + + + + + + Attribute 'v' of 'list-item' element contains the value to be added to + the list being defined + + + + + + + + + + + + The 'def-macros' section defines macros containing portions of + code frequently used in the action part of rules + + + + + + + + + + + + + + + + + Macro definition: + + A macro has a mandatory name (the value of 'n'), a number of parameters + (the value of 'npar') and a body containing arguments and statements. + + + + + + + + + + + + + + + + The rules section contains a sequence of one or more rules + + + + + + + + + + + + + Each rule has a pattern and an action + * attribute 'comment' allows to put in comments about the purpose of + the rule being defined + + + + + + + + + + + + + + + The pattern is specified in terms of pattern items, each one + representing a lexical form in the matched pattern + + + + + + + + + + + + + Each attribute to be activated is referred to by its name in the def-cats section + + + + + + + + + + + Encloses the procedural part of a rule + + + + + + + + + + + + + + The choose statement is a selection statement (similar to a case + statement) composed of one or more tested cases and an optional + otherwise + + + + + + + + + + + + + + + Each tested case is a block of zero or more statements + + + + + + + + + + + + + + + + + The otherwise case is also a block of one or more statements + + + + + + + + + + + + + + The test in a tested case may be a conjunction, a disjunction, or + a negation of simpler tests, as well as a simple equality test + + + + + + + + + + + + Each conjuntion test contains two or more simpler tests + + + + + + + + + + + + + + Each disjunction test contains two or more simpler tests + + + + + + + + + + + + + + The negation of a simpler test is a test itself + + + + + + + + + + + The simplest test is an equality test. 
The right part and the + left part of the equality may both be a clip (see below), a + literal string ('lit'), a literal tag ('lit-tag') or the value of + a variable ('var') defined in the def-vars section. When the attribute + 'caseless' is set to 'yes', the comparison is made without attending + to the case. + + + + + + + + + + + + + + + + + + + + + + Tests if the left part contains the right part at the beginning. + Both parts of the test may both be a clip (see below), a + literal string ('lit'), a literal tag ('lit-tag') or the value of + a variable ('var') defined in the def-vars section. When the attribute + 'caseless' is set to 'yes', the comparison is made without attending + to the case. + + + + + + + + + + + + + + + + + + + + + + Tests if the left part contains the right part at the end. + Both parts of the test may both be a clip (see below), a + literal string ('lit'), a literal tag ('lit-tag') or the value of + a variable ('var') defined in the def-vars section. When the attribute + 'caseless' is set to 'yes', the comparison is made without attending + to the case. + + + + + + + + + + + + + + + + + + + + + + Tests if the left part contains the right part at the beginning. + First parts of the test may be a clip (see below), a + literal string ('lit'), a literal tag ('lit-tag') or the value of + a variable ('var') defined in the def-vars section. The second part + must be always a list. When the attribute + 'caseless' is set to 'yes', the comparison is made without attending + to the case. + + + + + + + + + + + + + + + + + + + + + + Tests if the left part contains the right part at the end. + First parts of the test may be a clip (see below), a + literal string ('lit'), a literal tag ('lit-tag') or the value of + a variable ('var') defined in the def-vars section. The second part + must be always a list. When the attribute + 'caseless' is set to 'yes', the comparison is made without attending + to the case. 
+ + + + + + + + + + + + + + + + + + + + + + Tests if the left part contains the right part. + Both parts of the test may both be a clip (see below), a + literal string ('lit'), a literal tag ('lit-tag') or the value of + a variable ('var') defined in the def-vars section. When the attribute + 'caseless' is set to 'yes', the comparison is made without attending + to the case. + + + + + + + + + + + + + + + + + + + + + + 'in' performs a search of a value in a list. If 'caseless' is set to yes, + this search is performed without attending to the case + + + + + + + + + + + + + + + + + + + + + + 'list' refers, with the name in attribute 'n', a list defined before in + the 'section-def-list' section + + + + + + + + + + + An assignment statement ('let') assigns the value of a clip (see + below), a literal string ('lit'), a literal tag('lit-tag') or the + value of a global variable ('var') to either a global variable ('var') + or a clip + + + + + + + + + + + + + + This instruction appends the value of a clip (see + below), a literal string ('lit'), a literal tag('lit-tag') or the + value of a global variable ('var') to either a global variable ('var') + or a clip, identified by the "n" attribute + + + + + + + + + + + + + + 'out' is an output statement; it may output any sequence of + clips, literal strings, literal tags, variables, and whitespace items + (see below) + + + + + + + + + + + + + + The first argument of 'modify-case' copy the case of the second + argument. + + + + + + + + + + + + + + A macro may be called anywhere by name with one or more + arguments + + + + + + + + + + + + + + The attribute pos in each argument is used to refer to a lexical + form in the current rule. 
For example, if a 2-parameter macro + has been defined to perform noun-adjective agreement operations, + it may be used with arguments 1 and 2 in a noun-adjective rule, + with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with + arguments 1 and 3 in a noun-adverb-adjective rule, and with + arguments 2 and 1 in an adjective-noun rule + + + + + + + + + + + A 'clip' is a substring of a source-language or target-language + lexical form, extracted according to an attribute: + + * 'pos' is an index (1, 2, 3...) used to select a lexical form + inside the rule; + + * 'side' is used to select a source-language ('sl') or a + target-language ('tl') clip + + * the value of 'part' is the name of an attribute defined in + def-attrs, but may take also the values 'lem' (referring to + the lemma of the lexical form), 'lemh' (lemma head), 'lemq' + (lemma queue) and 'whole' (referring to the whole lexical form). + + * the value of 'queue' may be 'no' or 'yes'. 'yes' is assumed by + default. + + * 'link-to' causes the other attributes to be ignored in clip evaluation + when using 'clip' as a right hand side element (as value), and + returns its value. When using as a left hand side (as reference), + the value of the 'as' attribute is ignored. + + + + + + + + + + + + + + + + + + + + + + + A literal string value: the value of the literal is the value of + the 'v' attribute + + + + + + + + + + + A literal string value: the value of the literal is the value of + the 'v' attribute + + + + + + + + + + + Each 'var' is a variable identifier: the attribute n is the name + of the variable. When it is in an 'out', a 'test', or the right + part of a 'let', it represents the value of the variable; when in + the left part of a 'let' it represents the reference of the + variable. + + + + + + + + + + + TODO: + + + + + + + + + + + + + + + + A 'case-of' is a value representing the case of a "clip". 
This value + will be "aa" (all lowercase), "Aa" (first uppercase) and "AA", + (all uppercase). + + * 'pos' is an index (1, 2, 3...) used to select a lexical form + inside the rule; + + * 'side' is used to select a source-language ('sl') or a + target-language ('tl') clip + + * the value of 'part' is the name of an attribute defined in + def-attrs, but may take also the values 'lem' (referring to + the lemma of the lexical form), 'lemh' (lemma head), 'lemq' + (lemma queue) and 'whole' (referring to the whole lexical form). + + + + + + + + + + + + + + + + + + + + Concatenates a sequence of values. + + + + + + + + + + + + + Encloses a multiword. + + + + + + + + + + + + + Encloses a word inside an 'out' element. + + + + + + + + + + + + + Encloses a chunk inside an 'out' element. + * 'name' the pseudolemma of the chunk. + * 'namefrom' get the name from a variable. + * 'case' the variable to get the uppercase/lowercase policy + to apply it to the chunk name + + + + + + + + + + + + + + + + + + + + + + + A sequence of tags for a lexical unit. + + + + + + + + + + + + + A lexical unit tag. + + + + + + + + + + + + + 'b' is a [super]blanks item, indexed by pos; for example, a 'b' + with pos="2" refers to the [super]blanks (including format data + encapsulated by the de-formatter) between lexical form 2 and + lexical form 3. Managing [super]blanks explicitly allows for the + correct placement of format when the result of structural + transfer has more or less lexical items than the original or has + been reordered in some way. If attribute "pos" is not specified, then + a single blank (ASCII 32) is generated. 
+ + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.h (revision 69632) @@ -0,0 +1,41 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef _ALIGNER_TOOL_H_ +#define _ALIGNER_TOOL_H_ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + + +#include +#include + +#include + +namespace TMXAligner{ + +void alignerToolWithFilenames(const DictionaryItems& dictionary, + const std::string& huFilename, + const std::string& enFilename, + const AlignParameters& alignParameters, + const std::string& outputFilename = "" ); +} +#endif Property changes on: branches/apertium-tagger/apertium2/apertium/tmx_aligner_tool.h ___________________________________________________________________ Added: svn:mergeinfo ## -0,0 +0,0 ## Index: branches/apertium-tagger/apertium2/apertium/tmx_alignment.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_alignment.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_alignment.h (revision 69632) @@ -0,0 +1,115 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. 
* +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGNMENT_ALIGNMENT_H +#define __TMXALIGNER_ALIGNMENT_ALIGNMENT_H + +#include + +#include +#include + +namespace TMXAligner +{ + +// Simply double values for each sentence. Right now we store sentence lengths in them. +typedef std::vector SentenceValues; + +// See quasiDiagonal.h +typedef QuasiDiagonal AlignMatrix; + +// Contains directions, a bit like a force field. +typedef QuasiDiagonal TrelliMatrix; + +// A Rundle (x,y) cuts the bitext into two sub-bitexts: +// [0,x)+[0,y) and [x,huSize)+[y,enSize). +typedef std::pair Rundle; + +// A Trail is a strictly ordered list of Rundles. +// It cuts the bitext into small bitexts. +// Such a small bitext is called a hole or segmentum. +// A hole can contain zero Hungarian sentences, +// it can contain zero English sentences, but not both. +// A Trail is sometimes referred to as a Ladder. +typedef std::vector Trail; + +// A BisentenceList is formally identical to a Trail, but semantically very different. +// It represents an ordered list of bisentences. +// There are some functions which utilize the formal identity, +// manipulating both structures. +typedef std::vector< std::pair > BisentenceList; + +// OBSOLETE: +// TrailValues gives scores to the Rundles of a Trail (of the same size). +// Conceptually TrailValues should be attached to Trails. +// A TrailValues structure always accompanies a Trails list, +// but their consistency must be maintained by hand, pre-OO-style. (TODO) +// typedef std::vector TrailValues; + +// OBSOLETE: +// Has exactly the same relation to BisentenceList as +// a TrailValues has to a Trail. But note that these +// scores mark the confidence in a bisentence. This is +// very different from the confidence in a rundle.
+// typedef std::vector BisentenceValues; + +double closeness( double twoSentenceLength, double oneSentenceLength ); + +const double skipScore = -0.3; + + +// The main align function. +// Gets a confidence value for every sentence-pair, +// and sentence lengths for each sentence (for a Gale-Church-like scoring). +// Returns a trail with the best total score, and the computed dynMatrix matrix: +// dynMatrix[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals. +void align( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength, + Trail& bestTrail, AlignMatrix& dynMatrix ); + + +bool oneToOne( const Trail& bestTrail, int pos ); + +// Collect bisentences. +void trailToBisentenceList( const Trail& bestTrail, + BisentenceList& bisentenceList ); + +// Score precision-recall of a BisentenceList according to a hand-aligned bicorpus. +// For best results, zero-to-many holes of the hand-alignment should be subdivided to zero-to-ones. +// Builds the manual bisentencelist. The compared sets consist of Bisentences. +double scoreBisentenceList( const BisentenceList& bisentenceList, const Trail& trailHand ); + +// The same precision-recall calculation for Trails. The compared sets consist of Rundles. +double scoreTrail ( const Trail& trailAuto, const Trail& trailHand ); + + +const int outsideOfRadiusValue = -1000000; +const int insideOfRadiusValue = 0; + +// Fills the complement of the radius of the trail with minus infinities. +// The return value true means success. Failure means that during the fill, +// we intersected the outside of the quasidiagonal area. +// In this case, the operation is not finished. +bool borderDetailedAlignMatrix( AlignMatrix& m, const Trail& trail, int radius ); + +// What the name implies.
+void dumpAlignMatrix( const AlignMatrix& m, bool graphical ); + +template +void dumpAlignMatrix( const QuasiDiagonal& alignMatrix ); + +void dumpAlignMatrix( const QuasiDiagonal& alignMatrix, bool graphical ); + +void dumpTrelliMatrix( const TrelliMatrix& trellis ); + + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_ALIGNMENT_H Index: branches/apertium-tagger/apertium2/apertium/tmx_arguments_parser.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_arguments_parser.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_arguments_parser.cc (revision 69632) @@ -0,0 +1,203 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include +#include +#include + +// Could be better. +bool alphabetic( char c) +{ + return ((c>='a')&&(c<='z')) || ((c>='A')&&(c<='Z')) || (c=='_'); +} + +bool Arguments::read( int argc, char **argv ) +{ + for ( int i=1; i& remains ) +{ + remains.clear(); + + for ( int i=1; isecond.kind != AnyData::Int) + { + std::cerr << "Argument -" << name << ": integer expected.\n"; + throw "argument error"; + } + + num = it->second.dInt; + erase(name); + return true; +} + +bool Arguments::getSwitchConst( const ArgName& name, bool& sw ) const +{ + const_iterator it=find(name); + if (it==end()) + { + sw = false; + return true; + } + else if (! 
it->second.dString.empty()) + { + std::cerr << "Argument -" << name << ": value is not allowed.\n"; + return false; + } + else + { + sw = true; + return true; + } +} + +bool Arguments::getSwitch( const ArgName& name, bool& sw ) +{ + bool ok = getSwitchConst(name, sw); + if (ok) + erase(name); + + return ok; +} + +bool Arguments::getSwitchCompact( const ArgName& name ) +{ + bool sw(false); + bool ok = getSwitchConst(name, sw); + if (ok) + { + erase(name); + return sw; + } + else + { + std::cerr << "No value is allowed for argument -" << name << ".\n"; + throw "argument error"; + } +} + +void Arguments::checkEmptyArgs() const +{ + if (!empty()) + { + std::cerr << "Invalid argument: "; + + for ( Arguments::const_iterator it=begin(); it!=end(); ++it ) + { + std::cerr << "-" << it->first; + if (!it->second.dString.empty()) + std::cerr << "=" << it->second.dString; + std::cerr << " "; + } + std::cerr << std::endl; + + throw "argument error"; + } +} Index: branches/apertium-tagger/apertium2/apertium/tmx_arguments_parser.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_arguments_parser.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_arguments_parser.h (revision 69632) @@ -0,0 +1,72 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __ARGUMENTSPARSER_H +#define __ARGUMENTSPARSER_H + +#include +#include +#include + +// Current usage and limitations: +// Every argument starts with a '-'. +// It is a key/value pair. 
The delimiter +// is either the first '=' (erased), or the +// first nonalphabetic character (not erased). + +class AnyData +{ +public: + enum Kind { Int, String, Float, Set }; + +public: + AnyData() : kind(String), dInt(-1) {} + AnyData( const int& d ) : kind(Int), dInt(d) {} + AnyData( const std::string& d ) : kind(String), dInt(-1), dString(d) {} + +public: + Kind kind; + int dInt; + std::string dString; +}; + +typedef std::string ArgName; +typedef std::map< ArgName, AnyData > ArgumentMap; + +class Arguments : public ArgumentMap +{ +public: + // Very important note: When read finds a numeric/set argument, + // it sets anyData.kind to Int. But STILL, it fills anyData.dString, + // just in case. So if the ArgumentMap was built by Arguments::read, + // the dString fields are all filled. + bool read( int argc, char **argv ); + + // remains is filled with the arguments not starting with '-'. + bool read( int argc, char **argv, std::vector& remains ); + + // const if fails, erases arg if succeeds. + bool getNumericParam( const ArgName& name, int& num ); + + // sw is true if the switch is present. The function + // returns false if the argument value is not empty. + bool getSwitch( const ArgName& name, bool& sw ); + + bool getSwitchConst( const ArgName& name, bool& sw ) const; + + // Returns true if the switch is present. Throws an error message + // if the argument value is not empty. + bool getSwitchCompact( const ArgName& name ); + + void checkEmptyArgs() const; +}; + +#endif // #define __ARGUMENTSPARSER_H Index: branches/apertium-tagger/apertium2/apertium/tmx_book_to_matrix.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_book_to_matrix.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_book_to_matrix.cc (revision 69632) @@ -0,0 +1,382 @@ +/************************************************************************* +* * +* (C) Copyright 2004.
Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include +#include +#include // For IBMModelOne + +#include +#include + +#include // Just for similarityEvaluator, which should go anyway. TODO. + +namespace TMXAligner +{ + + +// (!!!) We assert that sx and sy are ordered sets of Word-s! +int intersectionSize( const WordList& sx, const WordList& sy ) +{ + int inter=0; + WordList::const_iterator sxt = sx.begin(); + WordList::const_iterator syt = sy.begin(); + WordList::const_iterator sxe = sx.end(); + WordList::const_iterator sye = sy.end(); + for ( ; sxt!=sxe && syt!=sye ; ) + { + if ( *sxt < *syt ) + ++sxt; + else if ( *sxt > *syt ) + ++syt; + else + { + ++inter; + ++sxt; + ++syt; + } + } + return inter; +} + +bool isNumber( const std::string& s ) +{ + int n = s.size(); + for ( int i=0; i'9') ) + { + return false; + } + } + return true; +} + +// (!!!) We assert that sx and sy are ordered sets of Word-s! 
+int specializedIntersectionSize( const WordList& sx, const WordList& sy ) +{ + int inter=0; + WordList::const_iterator sxt = sx.begin(); + WordList::const_iterator syt = sy.begin(); + WordList::const_iterator sxe = sx.end(); + WordList::const_iterator sye = sy.end(); + + int numberOfDifferingNumbers = 0; + int numberOfSameNumbers = 0; + + for ( ; sxt!=sxe && syt!=sye ; ) + { + if ( *sxt < *syt ) + { + if (isNumber(*sxt)) + { + ++numberOfDifferingNumbers; + } + ++sxt; + } + else if ( *sxt > *syt ) + { + if (isNumber(*syt)) + { + ++numberOfDifferingNumbers; + } + ++syt; + } + else + { + if (isNumber(*syt)) + { + ++numberOfSameNumbers; + } + ++inter; + ++sxt; + ++syt; + } + } + + if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) ) + { + inter += 10; + } + + return inter; +} + +const std::string paragraphString = "

"; + +bool isParagraph( const Phrase& phrase ) +{ + return ( (phrase.size()==1) && (phrase[0]==paragraphString) ); +} + +bool exceptionalScoring( const Phrase& hu, const Phrase& en, double& score ) +{ + bool huIsParagraph = isParagraph(hu); + bool enIsParagraph = isParagraph(en); + + // We like it if both are paragraph delimiters + if ( huIsParagraph && enIsParagraph ) + { + score = scoreOfParagraphMatch; + return true; + } + + if ( huIsParagraph || enIsParagraph ) + { + score = scoreOfParagraphMisMatch; + return true; + } + + return false; +} + + +const double maximumScore = 3.0; + +double scoreByIdentity( const Phrase& hu, const Phrase& en ) +{ + double score = 0; + if ( ! exceptionalScoring( hu, en, score ) ) + { + score = specializedIntersectionSize( hu, en ); + + // If we divide with max here, we are better at avoiding global mistakes. + // If we divide with min here, we are better at avoiding local mistakes. + // I think. This is just a theory. :) + // What is fact? If we divide with min, we give higher scores to valid 2-to-1 segments. + // But we make silly mistakes because we give higher scores to some invalid 1-to-1 segments like this: + // Kocogtam. -Like I said, I was out jogging-- -ObviousIy, you weren't jogging. + // Remember the day that they threw you out? + // + // Hopefully Gale-Church scoring compensates for this. Sometimes does not compensate enough. + score /= ( (hu.size()=(unsigned char)192)) + { + ++length; + } + } + return length; + } + else + { + return word.size(); + } +} + +double characterLength( const Phrase& words, bool utfCharCountingMode ) +{ + // A space ennyi betut er: + const double spaceValue = 0; // 1.5; + + + if (isParagraph(words)) + { + return paragraphDelimiterFictiveLength; + } + + double sum(0); + for ( size_t i=0; i +#include + +namespace TMXAligner +{ + +const double scoreOfParagraphMatch = 0.31; + +const double scoreOfParagraphMisMatch = -1.0; + +bool isParagraph( const Phrase& phrase ); + +// (!!!) 
We assert that sx and sy are ordered sets of Word-s! +int intersectionSize( const WordList& sx, const WordList& sy ); + +void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix ); + +class TransLex; + +double scoreByIdentity( const Phrase& hu, const Phrase& en ); + +double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex ); + +// This is much-much slower, but instead of identity, uses a many-to-many dictionary. +// For performance reasons, by convention does not calculate the similarity if the +// alignMatrix element contains outsideOfRadiusValue, a big negative number. +void sentenceListsToAlignMatrixTranslation( + const SentenceList& huSentenceListPretty, const SentenceList& enSentenceList, + const TransLex& transLex, + AlignMatrix& alignMatrixDetailed ); + +class IBMModelOne; + +void sentenceListsToAlignMatrixIBMModelOne( + const SentenceList& huSentenceList, const SentenceList& enSentenceList, + const IBMModelOne& modelOne, + AlignMatrix& alignMatrix ); + +int characterLength( const Word& words, bool utfCharCountingMode=false ); + +double characterLength( const Phrase& words, bool utfCharCountingMode=false ); + + +double characterLength( int start, int end, const SentenceList& sentenceList, bool utfCharCountingMode=false ); + +void setSentenceValues( const SentenceList& sentences, SentenceValues& lengths, bool utfCharCountingMode ); + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_BOOKTOMATRIX_H Index: branches/apertium-tagger/apertium2/apertium/tmx_dic_tree.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_dic_tree.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_dic_tree.h (revision 69632) @@ -0,0 +1,215 @@ +/************************************************************************* +* * +* (C) Copyright 2004. 
Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_TEIREADER_DICTIONARIES_H +#define __TMXALIGNER_TEIREADER_DICTIONARIES_H + +#include +#include +#include +#include + +namespace TMXAligner +{ + +// A simple tree class. +// +template +class DicTree +{ +public: + // Gets value a bit below. Ugly C++. + static const bool WarnOnConflict; + + DicTree() : id(0) {} + DicTree( const Identifier& id_ ) : id(id_) {} + + ~DicTree(); + + const Identifier& getIdentifier() const { return id; } + void setIdentifier( const Identifier& id_) { id=id_; } + DicTree* lookup( const Atom& word ) const; + DicTree& add( const Atom& word, const Identifier& id ); + bool empty() const { return children.empty(); } + + void dump( std::ostream& os ) const; + +private: + typedef std::map DicTreeMap; + DicTreeMap children; + Identifier id; +}; + +template +const bool DicTree::WarnOnConflict = false; + +// This structure stores a very sparse set-system of words. +// (A dictionary of complex expressions.) +// +// It supports the following query: +// It receives a set of words S. It gives back the sets +// of the set system that are contained in this set S. +// +// For it to be effective, we must be careful during the building phase: +// words in vector 'words' must be ordered by INCREASING frequency. Rare words first. + +template +class SubsetLookup +{ +public: + + typedef std::vector Atoms; + + void add( const Atoms& words, const Identifier& id ); + + void lookup( const Atoms& words, std::set& results ) const; + + void dump( std::ostream& os ) const; + +private: + DicTree tree; +}; + +// Implementation. F.ck C++ for having to put this in a header. 
+ +template +DicTree::~DicTree() +{ + for ( typename DicTreeMap::iterator it=children.begin(); it!=children.end(); ++it ) + { + delete it->second; + } +} + +// Az id-t soha nem irja at nullarol nemnullara. +// Ha nemnullarol nemnullara irja at, akkor kiabal elotte. +template +DicTree& DicTree::add( const Atom& word, const Identifier& id ) +{ + DicTree* v = lookup(word); + if (!v) + { + v = new DicTree(); + v->id = id; + children[word] = v; + } + else + { + if ( ( v->id != 0 ) && ( id != 0 ) ) + { + if (WarnOnConflict) + std::cerr << "warning: conflict in tree" << std::endl; + } + if ( id != 0 ) + { + v->id = id; + } + } + + return (*v); +} + +template +DicTree* DicTree::lookup( const Atom& word ) const +{ + typename DicTreeMap::const_iterator ft = children.find(word); + + if (ft==children.end()) + { + return 0; + } + else + { + return ft->second; + } +} + +template +void DicTree::dump( std::ostream& os ) const +{ + if (id!=0) + { + os << id << " "; + } + os << "{" << std::endl; + for ( typename DicTreeMap::const_iterator it=children.begin(); it!=children.end(); ++it ) + { + os << it->first << " "; + it->second->dump(os); + } + os << "}" << std::endl; +} + +template +void SubsetLookup::add( const Atoms& words, const Identifier& id ) +{ + DicTree* v = &tree; + + for ( typename Atoms::const_iterator it=words.begin(); it!=words.end(); ++it ) + { + DicTree& newv = v->add(*it,0); + v = &newv; + } + if ( v->getIdentifier() == 0 ) + { + v->setIdentifier(id); + } + else + { + if (DicTree::WarnOnConflict) + std::cerr << "warning: conflict in tree" << std::endl; + } +} + +template +void SubsetLookup::lookup( const Atoms& words, std::set& results ) const +{ + typedef std::set*> Pebbles; + Pebbles pebbles; + pebbles.insert(&tree); + + results.clear(); + + for ( typename Atoms::const_iterator it=words.begin(); it!=words.end(); ++it ) + { + const Atom& word = *it; + + for ( typename Pebbles::const_iterator jt=pebbles.begin(); jt!=pebbles.end(); ++jt ) + { + const DicTree* subTree 
= (*jt)->lookup(word) ; + + if (!subTree) + continue; + + const Identifier& id = subTree->getIdentifier(); + if (id!=0) + { + results.insert(id); + } + + if (!subTree->empty()) + { + pebbles.insert(subTree); + } + } + } +} + +template +void SubsetLookup::dump( std::ostream& os ) const +{ + tree.dump(os); +} + +} // namespace TMXAligner + + +#endif // #define __TMXALIGNER_TEIREADER_DICTIONARIES_H Index: branches/apertium-tagger/apertium2/apertium/tmx_dictionary.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_dictionary.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_dictionary.cc (revision 69632) @@ -0,0 +1,671 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define massert(e) if (!(e)) { std::cerr << #e << " failed" << std::endl; throw "assert"; } + +namespace TMXAligner +{ + +void eatwhite( std::istream& is ) +{ + while (true) + { + char c=is.peek(); + if ( (c!=' ') && (c!='\t') ) + { + break; + } + is.ignore(); + if (is.eof()) + break; + } +} + +void read( WordList& ph, std::istream& is ) +{ + ph.clear(); + + while (true) + { + if (is.eof()) + { + break; + } + if (is.peek()=='\r') + { + is.ignore(); + } + if (is.peek()=='\n') + { + is.ignore(); + break; + } + + Word w; + is >> w; + + eatwhite(is); + + if (w.empty()) + break; + + ph.push_back(w); + } +} + +void SentenceList::read( std::istream& is ) +{ + clear(); + + while (!is.eof()) + { + Sentence sentence; + + is >> sentence.id; + + if 
(sentence.id.empty()) + break; + + if (is.peek()!='\t') + break; + is.ignore(); + + TMXAligner::read( sentence.words, is ); + + push_back(sentence); + } +} + +void SentenceList::readNoIds( std::istream& is ) +{ + clear(); + + while ( (is.good()) && (!is.eof()) ) + { + Sentence sentence; + + TMXAligner::read( sentence.words, is ); + + push_back(sentence); + } +} + +void SentenceList::write( std::ostream& os ) const +{ + for ( size_t i=0; i halfs; + std::getline(is,line,'\n'); + + if (line.empty()) + { + break; + } + + split( line, halfs ); + if (halfs.size()!=2) + { + std::cerr << "Incorrect bicorpus file: " << halfs.size() << " records in line " << huSentenceList.size() << std::endl; + throw "data error"; + } + + { + std::istringstream iss(halfs[0]); + + Sentence sentence; + read( sentence.words, iss ); + + huSentenceList.push_back(sentence); + } + { + std::istringstream iss(halfs[1]); + + Sentence sentence; + read( sentence.words, iss ); + + enSentenceList.push_back(sentence); + } + } +} + +void writeBicorpus( std::ostream& os, const SentenceList& huSentenceList, const SentenceList& enSentenceList) +{ + assert(huSentenceList.size()==enSentenceList.size()); + + for ( size_t i=0; i> w; + + if (w.empty()) + break; + + // We allow vonyo7's "@" delimiter, and vonyokornai's "@V", "@N" etc. delimiters. 
+ if ( (w.size()<=2) && (w[0]=='@') ) + { + engPart = false; + delimiter = w; + } + else if (engPart) + { + en.push_back(w); + } + else + { + hu.push_back(w); + } + + while ( (is.peek()==' ') || (is.peek()=='\r') ) + { + is.ignore(); + } + + if (is.peek()=='\n') + { + is.ignore(); + break; + } + } + + if (en.empty()) + break; + + push_back(std::make_pair(en,hu)); + + } + +} + + +void Dictionary::read( const char* dictionaryFile ) +{ + throw "unimplemented"; +} + +void Dictionary::build( const DictionaryItems& dictionaryItems ) +{ + throw "unimplemented"; +} + +void Dictionary::reverse( const Dictionary& dic ) +{ + throw "unimplemented"; +} + +bool Dictionary::lookupWord( const Word& word, DictionaryItems& results ) const +{ + return false; +} + +bool Dictionary::lookupWordSet( const WordList& words, DictionaryItems& results ) const +{ + return false; +} + +void FrequencyMap::add( const Word& word ) +{ + ++operator[](word); +} + +void FrequencyMap::remove( const Word& word ) +{ + --operator[](word); +} + +void FrequencyMap::build( const WordList& wordList ) +{ + for ( size_t j=0; jsecond; + } + return totalItemNum; +} + +void FrequencyMap::dump( std::ostream& os, int itemNum ) const +{ + FrequencyMap::ReFrequencyMap reFrequencyMap; + reverseMap(reFrequencyMap); + + FrequencyMap::ReFrequencyMap::reverse_iterator rit; + for ( rit=reFrequencyMap.rbegin(); rit!=reFrequencyMap.rend(); ++rit ) + { + os << rit->first << "\t" << rit->second << "\n"; + + --itemNum; + if (itemNum==0) + break; + } + os.flush(); +} + +void FrequencyMap::highPassFilter( WordList& allowedWords, double ratio ) const +{ + allowedWords.clear(); + + FrequencyMap::ReFrequencyMap reFrequencyMap; + reverseMap(reFrequencyMap); + + FrequencyMap::ReFrequencyMap::reverse_iterator rit; + + int totalItemNum = total(); + + int localItemNum(0); + for ( rit=reFrequencyMap.rbegin(); rit!=reFrequencyMap.rend(); ++rit ) + { + localItemNum += rit->first; + if ( ((double)localItemNum)/totalItemNum > ratio ) + break; 
+ + allowedWords.push_back(rit->second); + } +} + +void FrequencyMap::lowPassFilter( WordList& allowedWords, double ratio ) const +{ + allowedWords.clear(); + + FrequencyMap::ReFrequencyMap reFrequencyMap; + reverseMap(reFrequencyMap); + + FrequencyMap::ReFrequencyMap::iterator rit; + + int totalItemNum = total(); + + int localItemNum(0); + for ( rit=reFrequencyMap.begin(); rit!=reFrequencyMap.end(); ++rit ) + { + localItemNum += rit->first; + + if ( ((double)localItemNum)/totalItemNum > ratio ) + break; + + allowedWords.push_back(rit->second); + } +} + +void FrequencyMap::reverseMap( FrequencyMap::ReFrequencyMap& reFrequencyMap ) const +{ + reFrequencyMap.clear(); + + for ( const_iterator it=begin(); it!=end(); ++it ) + { + reFrequencyMap.insert( FrequencyMap::ReFrequencyMap::value_type(it->second,it->first) ); + } +} + + +void filterSentences( SentenceList& sentenceList, const WordList& words ) +{ + std::set wordSet; + + for (size_t i=0; i& words ) +{ + words.clear(); + const char** currWordsPtr=wordsPtr; + while (**currWordsPtr!='\0') + { + words.insert(*currWordsPtr); + ++currWordsPtr; + } +} + +void removeHungarianStopwords( SentenceList& huSentenceList ) +{ + const char* huStopwordsC[] = + { + "a", "az", + + "egy", + + "és", + + "nem", "ne", + + "is", + + "van", + + "ő", + + "ha", + + "" + }; + + std::set stopwords; + cStyleStringsToStringSet( huStopwordsC, stopwords ); + + + for ( size_t i=0; i stopwords; + cStyleStringsToStringSet( enStopwordsC, stopwords ); + + + for (size_t i=0; isecond == enWord) + { + return true; + } + } + return false; +} + +double IBMModelOne::lookup( const Word& hu, const Word& en ) const +{ + TransProbs::const_iterator ft = transProbs.find( std::make_pair(hu,en) ); + + if (ft==transProbs.end()) + { + return 0; + } + else + { + return ft->second; + } +} + +void IBMModelOne::build( const SentenceList& huSentenceList, const SentenceList& enSentenceList ) +{ + transProbs.clear(); + + massert( 
huSentenceList.size()==enSentenceList.size() ); + + + std::map huProb; + + for ( size_t sen=0; sensecond /= huProb[it->first.first]; + } +} + +void IBMModelOne::reestimate( const SentenceList& huSentenceList, const SentenceList& enSentenceList ) +{ + throw "unimplemented"; +} + +// +double IBMModelOne::distance( const Phrase& hu, const Phrase& en ) const +{ + double val = log(1.0+hu.size()) / en.size() ; + + for ( size_t enPos=0; enPos0 ); + + val -= log(sum); + } + + throw "unimplemented"; +} + +} // namespace TMXAligner Index: branches/apertium-tagger/apertium2/apertium/tmx_dictionary.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_dictionary.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_dictionary.h (revision 69632) @@ -0,0 +1,131 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGNMENT_DICTIONARY_H +#define __TMXALIGNER_ALIGNMENT_DICTIONARY_H + +#include + +#include +#include +#include +#include + + +namespace TMXAligner +{ + +typedef std::pair DictionaryItem; + +class DictionaryItems : public std::vector +{ +public: + void read( std::istream& is ); +}; + +class HalfDictionary : public std::vector +{ +public: + void read( std::istream& is ); +}; + + +// After reading, this dictionary cannot be altered. +// Also, this is a strictly one-directional dictionary. +// If the other direction is needed, reverse( const Dictionary& dic ) another dictionary. 
+class Dictionary +{ +public: + void read( const char* dictionaryFile ); + void reverse( const Dictionary& dic ); + void build( const DictionaryItems& dictionaryItems ); + + bool lookupWord( const Word& word, DictionaryItems& results ) const; + bool lookupWordSet( const WordList& words, DictionaryItems& results ) const; + +private: + void buildWordLookupTable(); + +private: + DictionaryItems dictionaryItems; + + typedef std::map wordLookupTable; +}; + +class FrequencyMap : public std::map +{ +public: + void add( const Word& word ); + void remove( const Word& word ); + void build( const WordList& wordList ); + void remove( const WordList& wordList ); + void build( const SentenceList& sentenceList ); // Just for convenience. + int total() const; + void dump( std::ostream& os, int itemNum ) const; + void lowPassFilter( WordList& allowedWords, double ratio ) const; + void highPassFilter( WordList& allowedWords, double ratio ) const; + +private: + typedef std::multimap ReFrequencyMap; + void reverseMap( ReFrequencyMap& reFrequencyMap ) const; +}; + + +void filterSentences( SentenceList& sentenceList, const WordList& words ); + +void removeHungarianStopwords( SentenceList& huSentenceList ); +void removeEnglishStopwords ( SentenceList& enSentenceList ); +void removeStopwords ( SentenceList& huSentenceList, SentenceList& enSentenceList ); + + +typedef std::pair WordPair; + +class TransLex +{ +public: + + typedef std::multimap WordMultimap; + typedef WordMultimap::const_iterator WordMultimapIt; + typedef std::pair DictInterval; + + void add( const Word& huWord, const Word& enWord ); + void build( const DictionaryItems& dictionaryItems ); + + DictInterval lookupLeftWord ( const Word& huWord ) const; + DictInterval lookupRightWord( const Word& enWord ) const; + bool isPresent( const Word& huWord, const Word& enWord ) const; + +private: + WordMultimap forward; + WordMultimap backward; +}; + +class IBMModelOne +{ +public: + double lookup( const Word& hu, const Word& en ) const; 
+ + double distance( const Phrase& hu, const Phrase& en ) const; + + void build( const SentenceList& huSentenceList, const SentenceList& enSentenceList ); + + void reestimate( const SentenceList& huSentenceList, const SentenceList& enSentenceList ); + +public: + typedef std::pair WordPair; + typedef std::map TransProbs; + + TransProbs transProbs; +}; + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_DICTIONARY_H Index: branches/apertium-tagger/apertium2/apertium/tmx_quasi_diagonal.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_quasi_diagonal.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_quasi_diagonal.h (revision 69632) @@ -0,0 +1,171 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGNMENT_QUASIDIAGONAL_H +#define __TMXALIGNER_ALIGNMENT_QUASIDIAGONAL_H + +#include + +namespace TMXAligner +{ + +template +class QuasiDiagonal +{ +public: + + // Quite slow, because of the many bounds checks. + class QuasiDiagonalRow + { + public: + + // QuasiDiagonalRow is similar to a vector of size size_. The difference is + // that only the [offset_,offset_+thickness) subinterval can be written. + // Reading from outside this interval yields the default T(). + // Reading from outside the [0,size) interval yields a throw. + // It is NOT asserted that [offset_,offset_+thickness) + // should be a subset of [0,size). 
+ // + QuasiDiagonalRow( int size_=0, int offset_=0, int thickness=0, T outsideDefault_=T() ) + : offset(offset_), size(size_), data(thickness,T()), outsideDefault(outsideDefault_) {} + + enum ZoneType + { + DiagZone = 1, + MatrixZone = 2, + OutsideZone = 3 + }; + + ZoneType zone(int k) const + { + if ( ! ((k>=0) && (k=0) && (d<(int)data.size()) ) + { + return DiagZone; + } + else + { + return MatrixZone; + } + } + + const T& operator[](int k) const + { + if ( ! ((k>=0) && (k=0) && (d<(int)data.size()) ) + { + return data[k-offset]; + } + else + { + return outsideDefault; + } + } + + T& cell(int k) + { + if ( ! ((k>=0) && (k=0) && (d<(int)data.size()) ) + { + return data[k-offset]; + } + else + { + throw "out of quasidiagonal"; + } + } + + private: + int offset; + int size; + std::vector data; + T outsideDefault; + }; + + QuasiDiagonal( int height_, int width_, int thickness_, T outsideDefault_=T() ) + : height(height_), width(width_), thicknes(thickness_) + { + for ( int i=0; i0 ? s : 0 ); + } + + int rowEnd( int row ) const + { + int e=offset(row)+thicknes; + return ( e=height)) + { + throw "out of matrix"; + } + + return rows[y].cell(x); + } + + bool setCell( int y, int x, const T& t ) + { + cell(y,x) = t; + return true; + } + + int size() const { return height; } + // Yes, I know it's a stupid name. The reason is, I don't want to + // put width/height on the interface, because usually + // the first coord is the columns, but not here. + // This could lead to confusion. 
+ int otherSize() const { return width; } + + int thickness() const { return thicknes; } + +private: + std::vector rows; + int height,width,thicknes; +}; + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_QUASIDIAGONAL_H Index: branches/apertium-tagger/apertium2/apertium/tmx_serialize_impl.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_serialize_impl.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_serialize_impl.h (revision 69632) @@ -0,0 +1,52 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_INCLUDE_SERIALIZEIMPL_H +#define __TMXALIGNER_INCLUDE_SERIALIZEIMPL_H + +#include +#include +#include + +template +std::ostream& operator<<( std::ostream& os, const std::vector& v ) +{ + for ( typename std::vector::const_iterator it=v.begin(); it!=v.end(); ++it ) + { + os << *it ; + if (it+1!=v.end()) + os << " "; + } + return os; +} + +template +std::ostream& operator<<( std::ostream& os, const std::set& v ) +{ + typename std::set::const_iterator it=v.begin(); + while (true) + { + os << *it ; + + typename std::set::const_iterator itplus = it; + ++itplus; + + if (itplus == v.end()) + break; + else + os << " "; + + it = itplus; + } + return os; +} + +#endif // #define __TMXALIGNER_INCLUDE_SERIALIZEIMPL_H Index: branches/apertium-tagger/apertium2/apertium/tmx_strings_and_streams.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_strings_and_streams.cc (nonexistent) +++ 
branches/apertium-tagger/apertium2/apertium/tmx_strings_and_streams.cc (revision 69632) @@ -0,0 +1,38 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include + +namespace TMXAligner +{ + +void split( const std::string line, std::vector& words, char delim /*='\t'*/ ) +{ + words.clear(); + + std::string current; + + for (size_t i=0; i +#include + +namespace TMXAligner +{ + +void split( const std::string line, std::vector& words, char delim='\t' ); + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_INCLUDE_STRINGSANDSTREAMS_H Index: branches/apertium-tagger/apertium2/apertium/tmx_trail_postprocessors.cc =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_trail_postprocessors.cc (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_trail_postprocessors.cc (revision 69632) @@ -0,0 +1,481 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. 
* +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#include + +#include +#include + +#include +#include + +namespace TMXAligner +{ + +const bool global_postprocessLogging = false; + +TrailScores::TrailScores( const Trail& trail_, const AlignMatrix& dynMatrix_ ) : trail(trail_), dynMatrix(dynMatrix_) {} + +double TrailScores::operator()( int j ) const +{ + return + dynMatrix[trail[j ].first][trail[j ].second] + - + dynMatrix[trail[j+1].first][trail[j+1].second] ; +} + +BisentenceListScores::BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ ) + : bisentenceList(bisentenceList_), dynMatrix(dynMatrix_) {} + +double BisentenceListScores::operator()( int j ) const +{ + return + dynMatrix[bisentenceList[j].first ][bisentenceList[j].second] + - + dynMatrix[bisentenceList[j].first+1][bisentenceList[j].second+1] ; +} + + +TrailScoresInterval::TrailScoresInterval( const Trail& trail_, const AlignMatrix& dynMatrix_, + const SentenceList& huSentenceList_, const SentenceList& enSentenceList_ ) + : trail(trail_), dynMatrix(dynMatrix_), huSentenceList(huSentenceList_), enSentenceList(enSentenceList_) {} + +// The average score of the jth segmentum. The bigger the better. +// Division is by the maximum of the Hungarian and English intervals. +// This is a somewhat arbritary decision, and goes very badly with the +// scoring of the knight's moves. But we really have no better choice. +// +// Also, the method applies some very ugly hacks to avoid the effect of +// paragraph-delimiters. It strips both intervals of

<p>s, and +// modifies the dynMatrix-based score assuming that all <p>s got paired, +// except surplus <p>

s. +double TrailScoresInterval::scoreSegmentum( const Rundle& start, const Rundle& end ) const +{ + int huDiff = end.first - start.first ; + int enDiff = end.second - start.second ; + + double score = + dynMatrix[start.first][start.second] + - + dynMatrix[end. first][end. second] ; + + int i; + int huParagraphNum(0), enParagraphNum(0) ; + for ( i=start.first; ienParagraphNum ? huParagraphNum : enParagraphNum ) - estimatedParagraphMatches ; + + double scoreDeviationBecauseOfThoseStupidParagraphs = + scoreOfParagraphMatch * estimatedParagraphMatches + skipScore * estimatedParagraphMismatches; + + int huDiffParagraphAdjusted = huDiff - huParagraphNum ; + int enDiffParagraphAdjusted = enDiff - enParagraphNum ; + + int maxDiffParagraphAdjusted = huDiffParagraphAdjusted>enDiffParagraphAdjusted ? huDiffParagraphAdjusted : enDiffParagraphAdjusted ; + + if (maxDiffParagraphAdjusted==0) + { + return 0; + } + else + { + return ( score - scoreDeviationBecauseOfThoseStupidParagraphs ) / maxDiffParagraphAdjusted ; + } +} + +// The score of the jth segmentum. The bigger the better. +double TrailScoresInterval::operator()( int j ) const +{ + Rundle start = trail[j]; + Rundle end = trail[j+1]; + + return scoreSegmentum( start, end ); +} + +double TrailScoresInterval::operator()( int j, int k ) const +{ + Rundle start = trail[j]; + Rundle end = trail[k]; + + return scoreSegmentum( start, end ); +} + + +void removeRundles( Trail& trail, const std::set& rundlesToKill ) +{ + // Not a speed bottleneck. + Trail newTrail; + for ( size_t i=0; i0 ? x : -x ); +} + +// Egy zero-to-nonzero hole valamelyik oldalan levo rundle-t kiirtom, ha a +// rundle torlese kozeliti az uj hezagban a magyar karakterszam / angol karakterszam +// hanyadost egyhez. A bal es a jobb kozul azt valasztom, amelyik tobbet javit. +// +// Meg akkor is olvasztok, ha ezzel kicsit rontok, mivel a valodi zero-to-one eleg ritka. +// Legalabbis regenyekben. Az improvementSlack konstansnak domainfuggonek kellene lennie. 
+void spaceOutBySentenceLength( Trail& bestTrail, + const SentenceList& huSentenceListPretty, + const SentenceList& enSentenceList, + bool utfCharCountingMode ) +{ + // i most egy hole es nem egy rundle indexe. + for ( size_t i=1; iimprovementSlack) || (improvesRight>improvementSlack) ) + { + bool eraseLeft = (improvesLeft>improvesRight); + + if (eraseLeft) + { + bestTrail.erase(bestTrail.begin()+i); + } + else + { + bestTrail.erase(bestTrail.begin()+i+1); + } + + } + else + { + ++i; + } + } + else + { + ++i; + } + } +} + + +// The function gets a nonconst reference to bestTrail. +// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. +// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. +void postprocessTrailStart( Trail& bestTrail, + const TrailScoresInterval& trailScoresInterval, + const double& qualityThreshold ) +{ + const int window = 10; + + std::set rundlesToKill; + + int trailSize = bestTrail.size(); + + for ( int pos=1; pos rundlesToKill; + + int trailSize = bestTrail.size(); + + for ( int pos=trailSize-1-window-1; pos>0; --pos ) + { + double avg = trailScoresInterval( pos, pos+window ); + + if (avg rundlesToKill; + + int trailSize = bestTrail.size(); + + for ( int pos=1; pos rundlesToKill; + + int trailSize = bestTrail.size(); + + for ( int pos=1; pos=qualityThreshold) ) + { + bisentenceList.push_back(bestTrail[pos]); + } + } +} + + +// This is basically incorrect. +// Here we use the score of the right-hand segment to decide about the rundle. +// +// The function gets a nonconst reference to bestTrail. +// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. +// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. 
+void filterTrailByQuality( Trail& trail, const TrailScoresInterval& trailScoresInterval, + const double& qualityThreshold ) +{ + Trail newTrail; + + newTrail.push_back(trail.front()); + for ( size_t i=1; i= qualityThreshold ) + { + newTrail.push_back(trail[i]); + } + } + newTrail.push_back(trail.back()); + + trail = newTrail; +} + +void filterBisentenceListByQuality( BisentenceList& bisentenceList, const AlignMatrix& dynMatrix, + const double& qualityThreshold ) +{ + BisentenceList newBisentenceList; + + BisentenceListScores bisentenceListScores(bisentenceList,dynMatrix); + + for ( size_t i=0; i= qualityThreshold ) + { + newBisentenceList.push_back(bisentenceList[i]); + } + } + + bisentenceList = newBisentenceList; +} + +} // namespace TMXAligner Index: branches/apertium-tagger/apertium2/apertium/tmx_trail_postprocessors.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_trail_postprocessors.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_trail_postprocessors.h (revision 69632) @@ -0,0 +1,142 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGNMENT_TRAILPOSTPROCESSORS_H +#define __TMXALIGNER_ALIGNMENT_TRAILPOSTPROCESSORS_H + +#include + +namespace TMXAligner +{ + +// Helper class that calculates scores of holes. +class TrailScores +{ +public: + TrailScores( const Trail& trail_, const AlignMatrix& dynMatrix_ ); + // The score of the jth segmentum. The bigger the better. 
+ double operator()( int j ) const; + +private: + const Trail& trail; + const AlignMatrix& dynMatrix; +}; + + +class SentenceList; + + +// Helper class that calculates scores of segmentums. +class TrailScoresInterval +{ +public: + TrailScoresInterval( const Trail& trail_, const AlignMatrix& dynMatrix_, + const SentenceList& huSentenceList_, const SentenceList& enSentenceList_ ); + + // The average score of the jth segmentum. The bigger the better. + // Division is by the maximum of the Hungarian and English intervals. + // This is a somewhat arbritary decision, and goes very badly with the + // scoring of the knight's moves. But we really have no better choice. + // + // Also, the method applies some very ugly hacks to avoid the effect of + // paragraph-delimiters. It strips both intervals of

<p>s, and + // modifies the dynMatrix-based score assuming that all <p>s got paired, + // except surplus <p>

s. + double scoreSegmentum( const Rundle& start, const Rundle& end ) const; + + // The score of a segment identified by its index. + double operator()( int j ) const; + // The score of a union of segments identified by its start and end rundles' index. + // Both these methods rely on scoreSegmentum(): + // This means an important thing: the score only depends + // on the start and end rundle, not the rundles in between. + double operator()( int j, int k ) const; + +private: + const Trail& trail; + const AlignMatrix& dynMatrix; + const SentenceList& huSentenceList; + const SentenceList& enSentenceList; +}; + +// Helper class that calculates scores of one-to-one holes. +class BisentenceListScores +{ +public: + BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ ); + // The score of the jth bisentence. The bigger the better. + double operator()( int j ) const; + +private: + const BisentenceList& bisentenceList; + const AlignMatrix& dynMatrix; +}; + +void removeRundles( Trail& trail, const std::set& rundlesToKill ); + +// In cautious mode, auto-aligned rundles are thrown away if +// their left or right neighbour holes are not one-to-one. +// From the point of view of the resultant bisentences: +// In cautious mode, one-to-one bisentences are thrown away if +// they have left or right neighbours which are not one-to-one. +// This of course dramatically improves precision while slightly degrading recall. +void cautiouslyFilterTrail( Trail& bestTrail ); + +void spaceOutBySentenceLength( Trail& bestTrail, + const SentenceList& huSentenceListPretty, + const SentenceList& enSentenceList, + bool utfCharCountingMode ); + +// The function gets a nonconst reference to bestTrail. +// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. +// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. 
+void postprocessTrailStart( Trail& bestTrail, + const TrailScoresInterval& trailScoresInterval, + const double& qualityThreshold ); + +// The function gets a nonconst reference to bestTrail. +// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. +// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. +void postprocessTrailStartAndEnd( Trail& bestTrail, + const TrailScoresInterval& trailScoresInterval, + double qualityThreshold ); + +// The function gets a nonconst reference to bestTrail. +// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. +// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. +void postprocessTrail( Trail& bestTrail, + const TrailScoresInterval& trailScoresInterval, + double qualityThreshold ); + + +// Throws away rundles which are predominantly surrounded by not-one-to-one holes. +void postprocessTrailByTopology( Trail& bestTrail, double qualityThreshold ); + + +// Only collect bisentences with score at least qualityThreshold. +void trailToBisentenceList( const Trail& bestTrail, const TrailScores& trailScores, double qualityThreshold, + BisentenceList& bisentenceList ); + +// This is basically incorrect. +// Here we use the score of the right-hand segment to decide about the rundle. +// +// The function gets a nonconst reference to bestTrail. +// On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. +// Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. 
+void filterTrailByQuality( Trail& trail, const TrailScoresInterval& trailScoresInterval, + const double& qualityThreshold ); + +void filterBisentenceListByQuality( BisentenceList& bisentenceList, const AlignMatrix& dynMatrix, + const double& qualityThreshold ); + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_TRAILPOSTPROCESSORS_H Index: branches/apertium-tagger/apertium2/apertium/tmx_translate.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_translate.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_translate.h (revision 69632) @@ -0,0 +1,76 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGNMENT_TRANSLATE_H +#define __TMXALIGNER_ALIGNMENT_TRANSLATE_H + +#include +#include + +namespace TMXAligner +{ + +typedef std::map< std::string, Phrase > DumbDictionary; + +// This will become a class, with dictionary initialization, and a translate method. +// It will have various implementations. 
+ +void buildDumbDictionary( const DictionaryItems& dictionary, DumbDictionary& dumbDictionary ); + +void buildDumbDictionaryUsingFrequencies( + const DictionaryItems& dictionary, + FrequencyMap& enFreq, + DumbDictionary& dumbDictionary ); + +void buildDumbDictionary( TMXAligner::DumbDictionary& dumbDictionary, + const std::string& dictionaryFilename, + const TMXAligner::SentenceList& enSentenceList = TMXAligner::SentenceList() + ); + +void trivialTranslateWord( + const DumbDictionary& dumbDictionary, + const Word& originalWord, + Phrase& words + ); + +void trivialTranslate( + const DumbDictionary& dumbDictionary, + const Sentence& sentence, + Sentence& translatedSentence + ); + +void trivialTranslateSentenceList( + const DumbDictionary& dumbDictionary, + const SentenceList& sentenceList, + SentenceList& translatedSentenceList + ); + +void naiveTranslate( + const DictionaryItems& dictionary, + const SentenceList& sentenceList, + SentenceList& translatedSentenceList + ); + +typedef std::multimap< std::string, Phrase > DumbMultiDictionary; + +void buildDumbMultiDictionary( const DictionaryItems& dictionary, DumbMultiDictionary& dumbMultiDictionary, bool reverse ); + +void sortNormalizeSentences( TMXAligner::SentenceList& sentenceList ); + +// This function preprocesses the sentences so that sentenceListsToAlignMatrixIdentity can be applied to them. +// It does a rough translation and an alphabetic sort of words. 
+void normalizeTextsForIdentity( const DictionaryItems& dictionary, + const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty, + SentenceList& huSentenceListGarbled, SentenceList& enSentenceListGarbled ); + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_TRANSLATE_H Index: branches/apertium-tagger/apertium2/apertium/tmx_words.h =================================================================== --- branches/apertium-tagger/apertium2/apertium/tmx_words.h (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/tmx_words.h (revision 69632) @@ -0,0 +1,55 @@ +/************************************************************************* +* * +* (C) Copyright 2004. Media Research Centre at the * +* Sociology and Communications Department of the * +* Budapest University of Technology and Economics. * +* * +* Developed by Daniel Varga. * +* * +* From hunalign; for license see ../AUTHORS and ../COPYING.hunalign * +* * +*************************************************************************/ +#ifndef __TMXALIGNER_ALIGNMENT_WORDS_H +#define __TMXALIGNER_ALIGNMENT_WORDS_H + +#include +#include +#include + +namespace TMXAligner +{ + +typedef std::string String; + +typedef String Word; + +typedef std::vector WordList; + +typedef WordList Phrase; + +typedef std::vector Book; + +struct Sentence +{ + WordList words; + String sentence; + String id; +}; + +// Implemented in dictionary.cpp +class SentenceList : public std::vector +{ +public: + void read ( std::istream& is ); + void readNoIds( std::istream& is ); + void write( std::ostream& os ) const; + void writeNoIds( std::ostream& os ) const; +}; + +// Implemented in dictionary.cpp +void readBicorpus( std::istream& is, SentenceList& huSentenceList, SentenceList& enSentenceList); +void writeBicorpus( std::ostream& os, const SentenceList& huSentenceList, const SentenceList& enSentenceList); + +} // namespace TMXAligner + +#endif // #define __TMXALIGNER_ALIGNMENT_WORDS_H Index: 
branches/apertium-tagger/apertium2/apertium/html-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/html-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/html-format.xml (revision 69632) @@ -0,0 +1,378 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/html-noent-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/html-noent-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/html-noent-format.xml (revision 69632) @@ -0,0 +1,90 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/mediawiki-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/mediawiki-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/mediawiki-format.xml (revision 69632) @@ -0,0 +1,161 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/odt-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/odt-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/odt-format.xml (revision 69632) @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/pptx-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/pptx-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/pptx-format.xml (revision 69632) @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/wxml-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/wxml-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/wxml-format.xml (revision 69632) @@ -0,0 +1,72 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/xlsx-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/xlsx-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/xlsx-format.xml (revision 69632) @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: 
branches/apertium-tagger/apertium2/apertium/latex-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/latex-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/latex-format.xml (revision 69632) @@ -0,0 +1,268 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/rtf-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/rtf-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/rtf-format.xml (revision 69632) @@ -0,0 +1,473 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/txt-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/txt-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/txt-format.xml (revision 69632) @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/xpresstag-format.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/xpresstag-format.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/xpresstag-format.xml (revision 69632) @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file Index: branches/apertium-tagger/apertium2/apertium/apertium-filter-ambiguity.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-filter-ambiguity.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-filter-ambiguity.1 (revision 69632) @@ -0,0 +1,26 @@ +.TH apertium-filter-ambiguity 1 2006-03-21 "" "" +.SH NAME +apertium-filter-ambiguity \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-filter-ambiguity +[input_file [output_file]] +.PP +.SH DESCRIPTION +.BR apertium-filter-ambiguity +takes input from STDIN or input_file, gets tagger data, filters ambiguity +classes and outputs on STDOUT or output_file, in each case. +.PP +.SH SEE ALSO +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. 
You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-deformat.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-deformat.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-deformat.1 (revision 69632) @@ -0,0 +1,49 @@ +.TH apertium-gen-deformat 1 2006-03-21 "" "" +.SH NAME +apertium-gen-deformat \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-gen-deformat +[ +.B \-a \fR| +.B \-A \fR| +.B \-m \fR| +.B \-M \fR +] +.PP +.SH DESCRIPTION +.BR apertium-gen-deformat +is a script which generates a C++ deformatter for a particular format. The +deformatter reads in a format specification file in XML and outputs a C++ deformatter +using flex. +.SH OPTIONS +.TP +.B \-a +Runs in apertium standard mode. +.TP +.B \-A, +Runs in apertium optimised mode (default) +.TP +.B \-m +Runs in matxin standard mode (matxin is another open-source machine translation system: \fBhttp://www.sourceforge.org/matxin\fR) +.TP +.B \-M +Runs in matxin optimised mode +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-gen-reformat\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-reformat.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-reformat.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-reformat.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-gen-reformat 1 2006-03-21 "" "" +.SH NAME +apertium-gen-reformat \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +architecture: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-gen-reformat +[ +.B \-O \fR +] +.PP +.SH DESCRIPTION +.BR apertium-gen-reformat +is a script which generates a C++ reformatter for a particular format. The +reformatter reads in a format file in XML and outputs a C++ reformatter +using flex. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-gen-deformat\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-multiple-translations.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-multiple-translations.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-multiple-translations.1 (revision 69632) @@ -0,0 +1,43 @@ +.TH apertium-multiple-translations 1 2006-03-08 "" "" +.SH NAME +apertium-multiple-translations \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. 
+.SH SYNOPSIS +.B apertium-multiple-translations +preproc biltrans [input [output]] +.SH DESCRIPTION +.BR apertium-multiple-translations +is the program that outputs multiple translations of certain words in a text according to the +different possible translations of the words in the bilingual dictionary (in a dictionary +that supports it). The place to put this program in the modes.xml file is +just after apertium-pretransfer. +.PP +.RE +.SH FILES +These are the four files that can be used with this command: +.B preproc +Result of preprocess trules file +.PP +.B biltrans +Bilingual letter transducer file +.PP +.B infile +Input file (stdin by default). +.PP +.B outfile +Output file (stdout by default). +.PP +.SH SEE ALSO +.I apertium-transfer\fR(1), +.I apertium \fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005--2008 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-postlatex-raw.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-postlatex-raw.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-postlatex-raw.1 (revision 69632) @@ -0,0 +1,37 @@ +.TH apertium-postlatex-raw 1 2012-02-29 "" "" +.SH NAME +apertium-postlatex-raw \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-postlatex-raw +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-postlatex-raw +This filter generates LaTeX code from the output of the apertium-relatex +command. Non-ASCII characters are emitted in their native encoding, +depending on the locale of the running environment. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help.
+.PP +.SH SEE ALSO +.I apertium-deslatex\fR(1), +.I apertium-prelatex\fR(1), +.I apertium-relatex\fR(1), +.I apertium-postlatex-raw\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated constructions in LaTeX (i.e. custom defined tags) are not (yet) +supported. +.PP +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-postlatex.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-postlatex.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-postlatex.1 (revision 69632) @@ -0,0 +1,37 @@ +.TH apertium-postlatex 1 2012-02-29 "" "" +.SH NAME +apertium-postlatex \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-postlatex +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-postlatex +This filter generates LaTeX code from the output of apertium-relatex +command. Non-ASCII characters are transformed to ASCII-compatible LaTeX construction +rather than natively-encoded characters. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-deslatex\fR(1), +.I apertium-prelatex\fR(1), +.I apertium-relatex\fR(1), +.I apertium-postlatex\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated constructions in LaTeX (i.e. custom defined tags) are not (yet) +supported. +.PP +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-prelatex.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-prelatex.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-prelatex.1 (revision 69632) @@ -0,0 +1,37 @@ +.TH apertium-prelatex 1 2012-02-29 "" "" +.SH NAME +apertium-prelatex \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-prelatex +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-prelatex +This filter preprocesses LaTeX input, transforming it into a deformatted 'XMLish' +LaTeX custom format. The output is suitable for processing with the +apertium-deslatex deformatter. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-deslatex\fR(1), +.I apertium-postlatex\fR(1), +.I apertium-relatex\fR(1), +.I apertium-postlatex-raw\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated constructions in LaTeX (i.e. custom defined tags) are not (yet) +supported. +.PP +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-preprocess-transfer.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-preprocess-transfer.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-preprocess-transfer.1 (revision 69632) @@ -0,0 +1,37 @@ +.TH apertium-preprocess-transfer 1 2006-03-08 "" "" +.SH NAME +apertium-preprocess-transfer \- This application is part of ( +.B apertium +) +.PP +This tool is part of the open-source apertium machine translation +toolbox: \fBhttp://www.apertium.org\fR.
+.SH SYNOPSIS +.B apertium-preprocess-transfer +rules_file transfer_file +.SH DESCRIPTION +.BR apertium-preprocess-transfer +is a structural transfer preprocessor which reads in a structural transfer +rule file and generates a file with precompiled patterns and indexes to the +actions of the rules of the structural transfer module specification. +.PP +.RE +.SH FILES +These are the two files that can be used with this command: +.PP +.B rules_file +File with structural transfer rules +.PP +.B transfer_file +File with precompiled patterns +.PP +.SH SEE ALSO +.I apertium\fR(1), +.I apertium-transfer\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-rehtml.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-rehtml.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-rehtml.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-rehtml 1 2006-03-21 "" "" +.SH NAME +apertium-rehtml \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-rehtml +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-rehtml +is an HTML format processor. It restores the original HTML formatting +the text had before being passed through the apertium-deshtml deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. 
You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-relatex.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-relatex.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-relatex.1 (revision 69632) @@ -0,0 +1,37 @@ +.TH apertium-relatex 1 2012-02-29 "" "" +.SH NAME +apertium-relatex \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-relatex +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-relatex +This filter preprocesses the apertium generator output and removes superblank +marks so that subsequent processing can be done in the apertium pipeline. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-prelatex\fR(1), +.I apertium-postlatex\fR(1), +.I apertium-deslatex\fR(1), +.I apertium-postlatex-raw\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated constructions in LaTeX (i.e. custom defined tags) are not (yet) +supported. +.PP +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-remediawiki.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-remediawiki.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-remediawiki.1 (revision 69632) @@ -0,0 +1,35 @@ +.TH apertium-remediawiki 1 2006-03-21 "" "" +.SH NAME +apertium-remediawiki \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR.
+.SH SYNOPSIS +.B apertium-remediawiki +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-remediawiki +is a mediawiki format processor. It restores the original formatting +the text had (newlines, tabs, etc.) before being passed through the apertium-desmediawiki deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rehtml\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-reodt.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-reodt.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-reodt.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-reodt 1 2006-03-21 "" "" +.SH NAME +apertium-reodt \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-reodt +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-reodt +is an ODT format processor. It restores the original ODT formatting +the text had before being passed through the apertium-desodt deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License .
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-repptx.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-repptx.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-repptx.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-repptx 1 2006-03-21 "" "" +.SH NAME +apertium-repptx \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-repptx +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-repptx +is a PPTX format processor. It restores the original PPTX formatting +the text had before being passed through the apertium-despptx deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-rertf-cp1250.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-rertf-cp1250.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-rertf-cp1250.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-rertf 1 2006-03-21 "" "" +.SH NAME +apertium-rertf \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-rertf +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-rertf +is an RTF format processor.
It restores the original RTF formatting +the text had before being passed through the apertium-desrtf deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rehtml\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-rertf-cp1251.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-rertf-cp1251.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-rertf-cp1251.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-rertf 1 2006-03-21 "" "" +.SH NAME +apertium-rertf \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-rertf +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-rertf +is an RTF format processor. It restores the original RTF formatting +the text had before being passed through the apertium-desrtf deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rehtml\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-rertf.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-rertf.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-rertf.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-rertf 1 2006-03-21 "" "" +.SH NAME +apertium-rertf \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-rertf +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-rertf +is an RTF format processor. It restores the original RTF formatting +the text had before being passed through the apertium-desrtf deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rehtml\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-retxt.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-retxt.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-retxt.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-retxt 1 2006-03-21 "" "" +.SH NAME +apertium-retxt \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-retxt +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-retxt +is a text format processor. It restores the original formatting +the text had (newlines, tabs, etc.) before being passed through the apertium-destxt deformatter. 
+ +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-rehtml\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-rewxml.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-rewxml.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-rewxml.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-rewxml 1 2006-03-21 "" "" +.SH NAME +apertium-rewxml \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-rewxml +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-rewxml +is an WXML format processor. It restores the original WXML formatting +the text had before being passed through the apertium-deswxml deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-rexlsx.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-rexlsx.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-rexlsx.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium-rexlsx 1 2006-03-21 "" "" +.SH NAME +apertium-rexlsx \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-rexlsx +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-rexlsx +is an XLSX format processor. It restores the original XLSX formatting +the text had before being passed through the apertium-desxlsx deformatter. + +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-retxt\fR(1), +.I apertium-rertf\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-utils-fixlatex.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-utils-fixlatex.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-utils-fixlatex.1 (revision 69632) @@ -0,0 +1,38 @@ +.TH apertium-utils-fixlatex 1 2012-02-29 "" "" +.SH NAME +apertium-utils-fixlatex \- This application is part of ( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. 
+.SH SYNOPSIS +.B apertium-utils-fixlatex +[ [ ] ] +.PP +.SH DESCRIPTION +.BR apertium-utils-fixlatex +gawk-based script to fix some constructions in 'XMLish' apertium LaTeX +format to get better translation results. +.SH OPTIONS +.TP +.B \-h, \-\-help +Display this help. +.PP +.SH SEE ALSO +.I apertium-destxt\fR(1), +.I apertium-prelatex\fR(1), +.I apertium-relatex\fR(1), +.I apertium-postlatex\fR(1), +.I apertium-deslatex\fR(1), +.I apertium-postlatex-raw\fR(1), +.I lt-proc\fR(1), +.I apertium\fR(1). +.SH BUGS +Complicated constructions in LaTeX (i.e. custom defined tags) are not (yet) +supported. +.PP +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-unformat-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-unformat-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-unformat-header.sh (revision 69632) @@ -0,0 +1,229 @@ +PAIR="" +INPUT_FILE="/dev/stdin" +OUTPUT_FILE="/dev/stdout" + +[ -z "$TMPDIR" ] && TMPDIR=/tmp + + +message () +{ + echo "USAGE: $(basename $0) [-f format] [in [out]]" + echo " -f format one of: txt (default), html, rtf, odt, docx, wxml, xlsx, pptx" + echo " in input file (stdin by default)" + echo " out output file (stdout by default)" + exit 1; +} + +locale_utf8 () +{ + export LC_CTYPE=$(locale -a|grep -i "utf[.]*8"|head -1); + if [ "$LC_CTYPE" = "" ] + then echo "Error: Install an UTF-8 locale in your system"; + exit 1; + fi +} + +test_zip () +{ + if [ "$(which zip)" = "" ] + then echo "Error: Install 'zip' command in your system"; + exit 1; + fi + + if [ "$(which unzip)" = "" ] + then echo "Error: Install 'unzip' command in your system"; + exit 1; + fi +} + +test_gawk () +{ + GAWK=$(which gawk) + if [ "$GAWK" = "" ] + then
echo "Error: Install 'gawk' in your system" + exit 1 + fi +} + + +unformat_latex() +{ + test_gawk + + if [ "$FICHERO" = "" ] + then FICHERO=$(mktemp $TMPDIR/apertium.XXXXXXXX) + cat > $FICHERO + BORRAFICHERO="true" + fi + + $APERTIUM_PATH/apertium-prelatex $FICHERO | \ + $APERTIUM_PATH/apertium-utils-fixlatex | \ + $APERTIUM_PATH/apertium-deslatex >$SALIDA + + if [ "$BORRAFICHERO" = "true" ] + then rm -Rf $FICHERO + fi +} + + +unformat_odt () +{ + INPUT_TMPDIR=$(mktemp -d $TMPDIR/apertium.XXXXXXXX) + + locale_utf8 + test_zip + + unzip -q -o -d $INPUT_TMPDIR $FICHERO + find $INPUT_TMPDIR | grep content\\\.xml |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + $APERTIUM_PATH/apertium-desodt >$SALIDA + rm -Rf $INPUT_TMPDIR +} + +unformat_docx () +{ + INPUT_TMPDIR=$(mktemp -d $TMPDIR/apertium.XXXXXXXX) + + locale_utf8 + test_zip + + unzip -q -o -d $INPUT_TMPDIR $FICHERO + + for i in $(find $INPUT_TMPDIR|grep "xlsx$"); + do LOCALTEMP=$(mktemp $TMPDIR/apertium.XXXXXXXX) + $APERTIUM_PATH/apertium -f xlsx -d $DIRECTORY $OPCIONU $PREFIJO <$i >$LOCALTEMP; + cp $LOCALTEMP $i; + rm $LOCALTEMP; + done; + + find $INPUT_TMPDIR | grep "xml" |\ + grep -v -i \\\(settings\\\|theme\\\|styles\\\|font\\\|rels\\\|docProps\\\) |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + $APERTIUM_PATH/apertium-deswxml >$SALIDA + rm -Rf $INPUT_TMPDIR +} + +unformat_pptx () +{ + INPUT_TMPDIR=$(mktemp -d $TMPDIR/apertium.XXXXXXXX) + + locale_utf8 + test_zip + + unzip -q -o -d $INPUT_TMPDIR $FICHERO + + for i in $(find $INPUT_TMPDIR|grep "xlsx$"); + do LOCALTEMP=$(mktemp $TMPDIR/apertium.XXXXXXXX) + $APERTIUM_PATH/apertium -f xlsx -d $DIRECTORY $OPCIONU $PREFIJO <$i >$LOCALTEMP + cp $LOCALTEMP $i + rm $LOCALTEMP + done; + + find $INPUT_TMPDIR | grep "xml$" |\ + grep "slides\/slide" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + $APERTIUM_PATH/apertium-despptx >$SALIDA 
+ rm -Rf $INPUT_TMPDIR +} + + +unformat_xlsx () +{ + INPUT_TMPDIR=$(mktemp -d $TMPDIR/apertium.XXXXXXXX) + + locale_utf8 + test_zip + + unzip -q -o -d $INPUT_TMPDIR $FICHERO + find $INPUT_TMPDIR | grep "sharedStrings.xml" |\ + awk '{printf ""; PART = $0; while(getline < PART) printf(" %s", $0); printf("\n");}' |\ + $APERTIUM_PATH/apertium-desxlsx >$SALIDA + rm -Rf $INPUT_TMPDIR + +} + + +ARGS=$(getopt "f:" $*) +set -- $ARGS +for i +do + case "$i" in + -f) shift; FORMAT=$1; shift;; + --) shift; break;; + esac +done + +case "$#" in + 2) + OUTPUT_FILE=$2; + INPUT_FILE=$1; + if [ ! -e $INPUT_FILE ]; + then echo "Error: file '$INPUT_FILE' not found." + message; + fi + ;; + 1) + INPUT_FILE=$1; + if [ ! -e $INPUT_FILE ]; + then echo "Error: file '$INPUT_FILE' not found." + message; + fi + ;; + 0) + ;; + *) + message + ;; +esac + +if [ x$FORMAT = x ]; then FORMAT="txt"; fi + +FORMATADOR=$FORMAT; +FICHERO=$INPUT_FILE; +SALIDA=$OUTPUT_FILE; + + +case "$FORMATADOR" in + rtf) + MILOCALE=$(locale -a|grep -i -v "utf\|^C$\|^POSIX$"|head -1); + if [ "$MILOCALE" = "" ] + then echo "Error: Install a ISO-8859-1 compatible locale in your system"; + exit 1; + fi + export LC_CTYPE=$MILOCALE + ;; + html-noent) + FORMATADOR="html" + ;; + + latex) + unformat_latex + exit 0 + ;; + + odt) + unformat_odt + exit 0 + ;; + docx) + unformat_docx + exit 0 + ;; + xlsx) + unformat_xlsx + exit 0 + ;; + pptx) + unformat_pptx + exit 0 + ;; + + wxml) + locale_utf8 + ;; + *) + ;; + +esac + +$APERTIUM_PATH/apertium-des$FORMATADOR $FICHERO >$SALIDA Index: branches/apertium-tagger/apertium2/apertium/utils-fixlatex-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/utils-fixlatex-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/utils-fixlatex-header.sh (revision 69632) @@ -0,0 +1,52 @@ +INPUT_FILE=/dev/stdin +OUTPUT_FILE=/dev/stdout + +cat $INPUT_FILE | \ +gawk ' +function is_inline_tag(str, aux, val) +{ + 
for(val in INLINETAGS) + { + aux = INLINETAGS[val] ""; + if(gsub(aux, aux, str) == 1) + { + return 1; + } + } + + return 0; +} + +BEGIN{ + RS=""; + + INLINETAGS[1]=""; + INLINETAGS[2]=""; + INLINETAGS[3]=""; +} +{ + MYRECORD[++nline] = $0; +} +END{ + for(i=1; i < nline; i++) + { + if(gsub("", "", MYRECORD[i]) == 1) + { + if(is_inline_tag(MYRECORD[i])) + { + printf("%s", MYRECORD[i]); + } + else + { + printf("%s", MYRECORD[i]); + } + } + else + { + printf("%s", MYRECORD[i]); + } + } + + printf("%s", MYRECORD[nline]); +}' > $OUTPUT_FILE + Index: branches/apertium-tagger/apertium2/apertium/apertium-interchunk.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-interchunk.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-interchunk.1 (revision 69632) @@ -0,0 +1,54 @@ +.TH apertium\-interchunk 1 2007-03-11 "" "" +.SH NAME +apertium\-interchunk \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-interchunk +[\-tz] trules preproc [input [output]] +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. +.PP +It is the second transfer module of the Apertium level 2 transfer model after +\fIapertium-transfer\fR and before \fIapertium-postchunk\fR. +.PP +It takes care of interchunk processing operations such as chunk +reordering, changes in the morphosyntactical features of chunks +according to the information in neighboring chunks, or generating new +chunks. +.SH OPTIONS + \-t trace mode + \-z flush buffer on the null character +.SH FILES +These are the kinds of files that can be used with this command: +.PP +.B trules +A rules file with extension \fI.t2x\fR. 
+.PP +.B preproc +A file with extension \fI.t2x.bin\fR that holds the result of +preprocessing the \fItrules\fR file with +\fIapertium-preprocess-transfer\fR. +.PP +.B input, output +Represent the input and output files. By default they are the standard +input and standard output. +.SH SEE ALSO +.I apertium\-gen\-modes\fR(1), +.I apertium\-postchunk\fR(1), +.I apertium\-transfer\fR(1), +.I apertium\-validate\-interchunk\fR(1), +.I apertium\-validate\-modes\fR(1), +.I apertium\-validate\-postchunk\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005-2007 Universitat d'Alacant / Universidad de +Alicante. This is free software. You may +redistribute copies of it under the terms of the GNU General Public +License . Index: branches/apertium-tagger/apertium2/apertium/apertium-postchunk.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-postchunk.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-postchunk.1 (revision 69632) @@ -0,0 +1,52 @@ +.TH apertium\-postchunk 1 2007-03-11 "" "" +.SH NAME +apertium\-postchunk \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-postchunk +[\-z] trules preproc [input [output]] +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. +.PP +It is the third transfer module of the Apertium level 2 transfer model +after \fIapertium-transfer\fR and \fIapertium-interchunk\fR. +.PP +It generates lexical forms from the chunks generated by +apertium-interchunk by effecting some finishing changes in their +morphological information. 
+.SH OPTIONS +\-z flush buffer on the null character +.SH FILES +These are the kinds of files that can be used with this command: +.PP +.B trules +A rules file with extension \fI.t3x\fR. +.PP +.B preproc +A file with extension \fI.t3x.bin\fR that holds the result of +preprocessing the \fItrules\fR file with +\fIapertium-preprocess-transfer\fR. +.PP +.B input, output +Represent the input and output files. By default they are the standard +input and standard output. +.SH SEE ALSO +.I apertium\-gen\-modes\fR(1), +.I apertium\-interchunk\fR(1), +.I apertium\-validate\-postchunk\fR(1), +.I apertium\-validate\-interchunk\fR(1), +.I apertium\-validate\-modes\fR(1), +.I apertium\-transfer\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005-2007 Universitat d'Alacant / Universidad de +Alicante. This is free software. You may +redistribute copies of it under the terms of the GNU General Public +License . Index: branches/apertium-tagger/apertium2/apertium/apertium_config.h.cmake_in =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium_config.h.cmake_in (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium_config.h.cmake_in (revision 69632) @@ -0,0 +1,57 @@ +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "${PACKAGE_BUGREPORT}" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "${PACKAGE_NAME}" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "${PACKAGE_STRING}" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "${PACKAGE_TARNAME}" + +/* Define to the version of this package. 
*/ +#define PACKAGE_VERSION "${PACKAGE_VERSION}" + +#define HAVE_DECL_FPUTS_UNLOCKED ${HAVE_DECL_FPUTS_UNLOCKED} +#define HAVE_DECL_FGETC_UNLOCKED ${HAVE_DECL_FGETC_UNLOCKED} +#define HAVE_DECL_FPUTC_UNLOCKED ${HAVE_DECL_FPUTC_UNLOCKED} +#define HAVE_DECL_FWRITE_UNLOCKED ${HAVE_DECL_FWRITE_UNLOCKED} +#define HAVE_DECL_FREAD_UNLOCKED ${HAVE_DECL_FREAD_UNLOCKED} +#define HAVE_DECL_FGETWC_UNLOCKED ${HAVE_DECL_FGETWC_UNLOCKED} +#define HAVE_DECL_FPUTWC_UNLOCKED ${HAVE_DECL_FPUTWC_UNLOCKED} +#define HAVE_DECL_FPUTWS_UNLOCKED ${HAVE_DECL_FPUTWS_UNLOCKED} + +#define HAVE_DECL_FPUTS_NOLOCK ${HAVE_DECL_FPUTS_NOLOCK} +#define HAVE_DECL_FGETC_NOLOCK ${HAVE_DECL_FGETC_NOLOCK} +#define HAVE_DECL_FPUTC_NOLOCK ${HAVE_DECL_FPUTC_NOLOCK} +#define HAVE_DECL_FWRITE_NOLOCK ${HAVE_DECL_FWRITE_NOLOCK} +#define HAVE_DECL_FREAD_NOLOCK ${HAVE_DECL_FREAD_NOLOCK} +#define HAVE_DECL_FGETWC_NOLOCK ${HAVE_DECL_FGETWC_NOLOCK} +#define HAVE_DECL_FPUTWC_NOLOCK ${HAVE_DECL_FPUTWC_NOLOCK} +#define HAVE_DECL_FPUTWS_NOLOCK ${HAVE_DECL_FPUTWS_NOLOCK} + +#if !defined(HAVE_DECL_FPUTS_UNLOCKED) && defined (HAVE_DECL_FPUTS_NOLOCK) +#define fputs_unlocked _fputs_nolock +#endif +#if !defined(HAVE_DECL_FGETC_UNLOCKED) && defined (HAVE_DECL_FGETC_NOLOCK) +#define fgetc_unlocked _fgetc_nolock +#endif +#if !defined(HAVE_DECL_FPUTC_UNLOCKED) && defined (HAVE_DECL_FPUTC_NOLOCK) +#define fputc_unlocked _fputc_nolock +#endif +#if !defined(HAVE_DECL_FWRITE_UNLOCKED) && defined (HAVE_DECL_FWRITE_NOLOCK) +#define fwrite_unlocked _fwrite_nolock +#endif +#if !defined(HAVE_DECL_FREAD_UNLOCKED) && defined (HAVE_DECL_FREAD_NOLOCK) +#define fread_unlocked _fread_nolock +#endif +#if !defined(HAVE_DECL_FPUTWS_UNLOCKED) && defined (HAVE_DECL_FPUTWS_NOLOCK) +#define fputws_unlocked _fputws_nolock +#endif +#if !defined(HAVE_DECL_FGETWC_UNLOCKED) && defined (HAVE_DECL_FGETWC_NOLOCK) +#define fgetwc_unlocked _fgetwc_nolock +#endif +#if !defined(HAVE_DECL_FPUTWC_UNLOCKED) && defined (HAVE_DECL_FPUTWC_NOLOCK) +#define fputwc_unlocked 
_fputwc_nolock +#endif Index: branches/apertium-tagger/apertium2/apertium/rtf-format-cp1250.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/rtf-format-cp1250.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/rtf-format-cp1250.xml (revision 69632) @@ -0,0 +1,532 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/rtf-format-cp1251.xml =================================================================== --- branches/apertium-tagger/apertium2/apertium/rtf-format-cp1251.xml (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/rtf-format-cp1251.xml (revision 69632) @@ -0,0 +1,576 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Index: branches/apertium-tagger/apertium2/apertium/apertium-lextor.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-lextor.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-lextor.1 (revision 69632) @@ -0,0 +1,132 @@ +.TH apertium-lextor 1 2006-12-12 "" "" +.SH NAME +apertium-lextor \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. 
+.SH SYNOPSIS +.B apertium-lextor +.B \-\-trainwrd\fR stopwords words n left right corpus model +[ +.B \-\-weightexp\fR w +] +[ +.B \-\-debug\fR +] +.PP +.B apertium\-lextor +.B \-\-trainlch\fR stopwords lexchoices n left right corpus wordmodel dic bildic model +[ +.B \-\-weightexp\fR w +] +[ +.B \-\-debug\fR +] +.PP +.B apertium\-lextor +.B \-\-lextor\fR model dic left right +[ +.B \-\-debug\fR +] +[ +.B \-\-weightexp\fR w +] +.PP +.SH DESCRIPTION +.BR apertium\-lextor +is the application responsible for training and usage of the \fIlexical +selector module\fR. +.SH OPTIONS +.PP +.B \-\-trainwrd | \-t\fR +.br +Train word co-occurrences model. It needs the following required parameters: +.TP +.I stopwords\fR file containing a list of stop words. Stop words are ignored. +.TP +.I words\fR file containing a list of words. For each word a co-occurrence model is built. +.TP +.I n\fR number of words per co\-occurrence model (for each model, the \fIn\fR most frequent words). +.TP +.I left\fR left\-side context to take into account (number of words). +.TP +.I right\fR right\-side context to take into account (number of words). +.TP +.I corpus\fR file containing the training corpus. +.TP +.I model\fR output file on which the co\-occurrence models are saved. +.PP +.B \-\-trainlch | \-r\fR +.br +Train lexical choices co\-occurrence models using a target language +co\-occurrence model and a bilingual dictionary. It needs the +following required parameters: +.TP +.I stopwords\fR file containing a list of stop words. Stop words are ignored. +.TP +.I lexchoices\fR file containing a list of lexical choices. For each lexical choice a co\-occurrence model is built. +.TP +.I n\fR number of words per co\-occurrence model (for each model, the n most frequent words). +.TP +.I left\fR left\-side context to take into account (number of words). +.TP +.I right\fR right\-side context to take into account (number of words). +.TP +.I corpus\fR file containing the training corpus. 
+.TP +.I wordmodel\fR target\-language word co\-occurrence model (previously trained by means of the \fB\-\-trainwrd\fR option). +.TP +.I dic\fR the lexical-selection dictionary (binary format). +.TP +.I bildic\fR the bilingual dictionary (binary format). +.TP +.I model\fR output file on which the co\-occurrence models are saved. +.PP +.PP +.B \-\-lextor | \-l +.br +Perform the lexical selection on the input stream. It needs the +following required parameters: +.TP +.I model\fR file containing the model to be used for the lexical selection. +.TP +.I dic\fR lexical\-selection dictionary (binary format). +.TP +.I left\fR left\-side context to take into account (number of words). +.TP +.I right\fR right\-side context to take into account (number of words). +.PP +.B \-\-weightexp w +.br +Specify a weight value to change the influence of surrounding words +while training or performing the lexical selection. The parameter +\fIw\fR must be a positive value. +.PP +.B \-\-debug | \-d +.br +Show debug information while working. +.PP +.B \-\-help | \-h +.br +Shows this help. +.PP +.B \-\-version | \-v +.br +Shows license information. +.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-lextor\-eval\fR(1), +.I apertium\-gen\-lextormono\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. 
Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-lextorbil.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-lextorbil.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-lextorbil.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-gen-lextorbil 1 2006-12-11 "" "" +.SH NAME +apertium-gen-lextorbil \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-gen\-lextorbil +.I lr\fR\ | +.I rl\fR +input_file output_file +.PP +.SH DESCRIPTION +.BR apertium\-gen\-lextorbil +is the application responsible for generating the bilingual dictionary +used by the transfer module when apertium\-lextor is being used to +perform lexical selection. +.SH OPTIONS +.TP +.B lr\fR The bilingual dictionary to generate is for left to right translation. +.PP +.B rl\fR The bilingual dictionary to generate is for right to left translation. +.SH FILES +These are the kinds of files used with this tool: +.PP +.B input_file +A bilingual dictionary. +.PP +.B output_file +A bilingual dictionary in which each word has \fIonly\fR one +translation equivalent. +.PP +.SH SEE ALSO +.I apertium\-gen\-lextormono\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-lextor\fR(1), +.I apertium\-lextor\-eval\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. 
Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-lextormono.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-lextormono.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-lextormono.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-gen-lextormono 1 2006-12-11 "" "" +.SH NAME +apertium-gen-lextormono \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-gen\-lextormono +.I lr\fR\ | +.I rl\fR +input_file output_file +.PP +.SH DESCRIPTION +.BR apertium\-gen\-lextormono +is the application responsible for generating the monolingual +dictionary used by the lexical selection module to know about the +translation sense marks of each source language word. +.SH OPTIONS +.TP +.B lr\fR The monolingual dictionary to generate is for left to right translation. +.PP +.B rl\fR The monolingual dictionary to generate is for right to left translation. +.SH FILES +These are the kinds of files used with this tool: +.PP +.B input_file +A bilingual dictionary. +.PP +.B output_file +A monolingual dictionary that for each word gives its translation +sense marks. +.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-lextor\-eval\fR(1), +.I apertium\-lextor\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. 
Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-modes.1 (revision 69632) @@ -0,0 +1,43 @@ +.TH apertium\-gen\-modes 1 2007-03-11 "" "" +.SH NAME +apertium\-gen\-modes \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-gen\-modes +modes-file [package name] +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. +.PP +It creates a series of \fI.mode\fR files from a \fImodes-file\fR. +These files are then put into the current directory if they are marked +as \fIinstall\fR; modes that are not to be installed, for +example debugging modes, are put into a \fImodes\fR directory +inside the linguistic data. +.SH OPTIONS +If a package name is specified, it creates the modes with the apertium +install prefix plus the package name. If you wish to install the modes, +you should specify the package name. If you don't want to install the modes, +leave it off. +.SH FILES +.B modes-file +An XML file that tells \fBapertium\-gen\-modes\fR which scripts must +be created in the directory \fImodes\fR. +.SH SEE ALSO +.I apertium\-interchunk\fR(1), +.I apertium\-postchunk\fR(1), +.I apertium\-validate\-interchunk\fR(1), +.I apertium\-validate\-modes\fR(1), +.I apertium\-validate\-postchunk\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005-2007 Universitat d'Alacant / Universidad de +Alicante. This is free software. You may +redistribute copies of it under the terms of the GNU General Public +License . 
Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-stopwords-lextor.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-stopwords-lextor.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-stopwords-lextor.1 (revision 69632) @@ -0,0 +1,45 @@ +.TH apertium-gen-stopwords-lextor 1 2006-12-12 "" "" +.SH NAME +apertium-gen-stopwords-lextor \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-gen\-stopwords\-lextor +n input_file output_file +.PP +.SH DESCRIPTION +.BR apertium\-gen\-stopwords\-lextor +is the application responsible for generating the list of +\fIstopwords\fR used by the lexical selection module +(apertium\-lextor). Stopwords are ignored as they cannot have multiple +translations. +.SH OPTIONS +.B n +the desired number of stopwords. +.SH FILES +These are the kinds of parameters and files used with this tool: +.PP +.B input_file +contains a large preprocessed corpus (see +apertium\-preprocess\-corpus\-lextor). +.PP +.B output_file +The file which gets the generated stopwords. +.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-gen\-lextormono\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-lextor\-eval\fR(1), +.I apertium\-lextor\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. 
Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-wlist-lextor-translation.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-wlist-lextor-translation.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-wlist-lextor-translation.1 (revision 69632) @@ -0,0 +1,56 @@ +.TH apertium-gen-wlist-lextor-translation 1 2006-12-12 "" "" +.SH NAME +apertium-gen-wlist-lextor-translation \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-gen\-wlist\-lextor\-translation +.B \-\-mono|\-m\fR dic.bin +.B \-\-bil|\-b\fR bildic.bin +.B \-\-wlist|\-w\fR wlistfile +.PP +.SH DESCRIPTION +.BR apertium\-gen\-wlist\-lextor\-translation +is the application responsible for generating all the possible +translations of polysemous words. +.SH OPTIONS +.TP +.B \-\-mono|\-m\fR dic.bin +.TP +Specifies the monolingual lexical selection dictionary to use (see apertium\-gen\-lextormono). +.TP +.B \-\-bil|\-b\fR bildic.bin +.TP +Specifies the bilingual lexical selection dictionary to use (see apertium\-gen\-lextorbil). +.TP +.B \-\-wlist|\-w\fR wlistfile +.TP +Specifies the list of words to translate (see apertium\-gen\-wlist\-lextor). +.TP +.B \-\-help|\-h\fR +.TP +Shows a brief usage help. +.TP +.B \-\-version|\-v\fR +.TP +Shows the version string of this tool and its license. +.SH FILES +This tool uses no files apart from the ones associated with each option. +.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-lextormono\fR(1), +.I apertium\-lextor\-eval\fR(1), +.I apertium\-lextor\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! 
+.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. Index: branches/apertium-tagger/apertium2/apertium/apertium-gen-wlist-lextor.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-gen-wlist-lextor.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-gen-wlist-lextor.1 (revision 69632) @@ -0,0 +1,42 @@ +.TH apertium-gen-wlist-lextor 1 2006-12-12 "" "" +.SH NAME +apertium-gen-wlist-lextor \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-gen\-wlist\-lextor +input_file output_file +.PP +.SH DESCRIPTION +.BR apertium\-gen\-wlist\-lextor +is the application responsible for generating the list of words used by +apertium\-lextor. +.SH OPTIONS +This tool currently has no options. +.SH FILES +These are the kinds of files used with this tool: +.PP +.B input_file +is a lextor monolingual dictionary file generated with +\fIapertium\-gen\-lextormono\fR. These files usually have the extension \fI.dix\fR. +.PP +.B output_file +The file which gets the generated list of words. +.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-gen\-lextormono\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-lextor\-eval\fR(1), +.I apertium\-lextor\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. 
Index: branches/apertium-tagger/apertium2/apertium/apertium-lextor-eval.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-lextor-eval.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-lextor-eval.1 (revision 69632) @@ -0,0 +1,62 @@ +.TH apertium-lextor-eval 1 2006-12-12 "" "" +.SH NAME +apertium-lextor-eval \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-lextor\-eval +.B \-\-reference\fR reftext +.B \-\-parameters\fR | \fB\-p\fR model dic left right +.PP +.SH DESCRIPTION +.BR apertium\-lextor\-eval +is the application used to evaluate the performance of lexical +selection models that have been previously estimated with +\fBapertium\-lextor\fR(1). To achieve this purpose a manually +disambiguated corpus is used. This corpus is read from the standard +input and must be in the intermediate format used by apertium. +.SH OPTIONS +.TP +.B \-\-reference | \-r\fR +.br +Specifies the reference corpus used for evaluation (one word per line +with the correct translation sense for those words with more than +one). +.TP +.B \-\-parameters | \-p\fR +.br +Specifies the following required parameters: +.TP +.I model\fR the file containing the model to be used for the lexical selection. +.TP +.I dic\fR the lexical\-selection dictionary in binary format. +.TP +.I left\fR left\-side context to take into account (number of words). +.TP +.I right\fR right\-side context to take into account (number of words). +.PP +.B \-\-help | \-h +.br +Shows this help. +.PP +.B \-\-version | \-v +.br +Shows license information. 
+.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-preprocess\-corpus\-lextor\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-gen\-lextormono\fR(1), +.I apertium\-lextor\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. Index: branches/apertium-tagger/apertium2/apertium/apertium-preprocess-corpus-lextor.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-preprocess-corpus-lextor.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-preprocess-corpus-lextor.1 (revision 69632) @@ -0,0 +1,48 @@ +.TH apertium-preprocess-corpus-lextor 1 2006-12-12 "" "" +.SH NAME +apertium-preprocess-corpus-lextor \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://apertium.org\fR. +.SH SYNOPSIS +.B apertium\-preprocess\-corpus\-lextor +data_dir translation_dir input_file output_file +.PP +.SH DESCRIPTION +.BR apertium\-preprocess\-corpus\-lextor +is the application responsible for preprocessing the training corpus +for the lexical selector training. +.SH OPTIONS +This tool currently has no options. +.SH FILES +These are the kinds of files and directories used with this tool: +.PP +.B data_dir +the path to the linguistic data to use. +.PP +.B translation_dir +the translation direction to use. +.PP +.B input_file +contains a large corpus in +.I raw\fR format. +.PP +.B output_file +The file which gets the preprocessed corpus. 
+.PP +.SH SEE ALSO +.I apertium\-gen\-lextorbil\fR(1), +.I apertium\-gen\-lextormono\fR(1), +.I apertium\-lextor\-eval\fR(1), +.I apertium\-gen\-stopwords\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\fR(1), +.I apertium\-gen\-wlist\-lextor\-translation\fR(1), +.I apertium\-lextor\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2005,2006 Universitat d'Alacant / Universidad de Alicante. All rights +reserved. Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-acx.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-acx.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-acx.1 (revision 69632) @@ -0,0 +1,40 @@ +.TH apertium\-validate\-acx 1 2006\-03\-11 "" "" +.SH NAME +apertium\-validate\-acx \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open\-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium\-validate\-acx + +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. +.PP +It is a script that validates a set of structural acx rules +against the apertium structural acx rules RNG using the +\fBxmllint\fR utility. +.SH OPTIONS +It has no options. +.SH FILES +.B input_file +An \fIacx.xml\fR file +.PP +.B acx.rng +The RELAX NG schema used to validate the input file. +.SH SEE ALSO +.I apertium\-gen\-modes\fR(1), +.I apertium\-gen\-oldbil\fR(1), +.I apertium\-interchunk\fR(1), +.I apertium\-validate\-modes\fR(1), +.I apertium\-validate\-interchunk\fR(1), +.I apertium\-validate\-postchunk\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005-2007 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-dictionary.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-dictionary.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-dictionary.1 (revision 69632) @@ -0,0 +1,23 @@ +.TH apertium-validate-dictionary 1 2006-03-21 "" "" +.SH NAME +apertium-validate-dictionary \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-validate-dictionary +.SH DESCRIPTION +.BR apertium-validate-dictionary +is a script that validates a dictionary file against +the apertium DTD file for dictionaries using the xmllint utility. + +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-interchunk.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-interchunk.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-interchunk.1 (revision 69632) @@ -0,0 +1,34 @@ +.TH apertium\-validate\-interchunk 1 2006\-03\-11 "" "" +.SH NAME +apertium\-validate\-interchunk \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open\-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium\-validate\-interchunk + +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. 
+.PP +It is a script that validates a set of structural interchunk rules +against the apertium structural interchunk rules DTD using the +\fBxmllint\fR utility. +.SH OPTIONS +It has no options. +.SH FILES +.B input_file +A \fIinterchunk.xml\fR file +.PP +.B interchunk.dtd +The DTD used to validate the input file. +.PP +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005-2007 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-modes.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-modes.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-modes.1 (revision 69632) @@ -0,0 +1,39 @@ +.TH apertium\-validate\-modes 1 2006\-03\-11 "" "" +.SH NAME +apertium\-validate\-modes \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open\-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium\-validate\-modes + +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. +.PP +It is a script that validates a \fImodes.xml\fR file against the +apertium structural modes DTD using the \fBxmllint\fR utility. +.SH OPTIONS +It has no options. +.SH FILES +.B input_file +A \fImodes.xml\fR file +.PP +.B modes.dtd +The DTD used to validate the input file. +.SH SEE ALSO +.I apertium\-gen\-modes\fR(1), +.I apertium\-gen\-oldbil\fR(1), +.I apertium\-interchunk\fR(1), +.I apertium\-validate\-postchunk\fR(1), +.I apertium\-validate\-interchunk\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005-2007 Universitat d'Alacant / Universidad de Alicante. +This is free software. 
You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-postchunk.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-postchunk.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-postchunk.1 (revision 69632) @@ -0,0 +1,39 @@ +.TH apertium\-validate\-postchunk 1 2006\-03\-11 "" "" +.SH NAME +apertium\-validate\-postchunk \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open\-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium\-validate\-postchunk + +.SH DESCRIPTION +This is an intermediate tool from Apertium level 2 engine. You should +never have to use it independently. +.PP +It is a script that validates a set of structural postchunk rules +against the apertium structural postchunk rules DTD using the +\fBxmllint\fR utility. +.SH OPTIONS +It has no options. +.SH FILES +.B input_file +A \fIpostchunk.xml\fR file +.PP +.B postchunk.dtd +The DTD used to validate the input file. +.SH SEE ALSO +.I apertium\-gen\-modes\fR(1), +.I apertium\-gen\-oldbil\fR(1), +.I apertium\-interchunk\fR(1), +.I apertium\-validate\-modes\fR(1), +.I apertium\-validate\-interchunk\fR(1). +.SH BUGS +Lots of them...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005-2007 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . 
+ Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-tagger.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-tagger.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-tagger.1 (revision 69632) @@ -0,0 +1,25 @@ +.TH apertium-validate-tagger 1 2006-03-21 "" "" +.SH NAME +apertium-validate-tagger \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation +toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-validate-tagger +.SH DESCRIPTION +.BR apertium-validate-tagger +is a script that checks the validity of a set of rules which +enforce the state to state transition probabilities used by the +part-of-speech tagger. The script uses xmllint to validate +against a DTD. + +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/apertium-validate-transfer.1 =================================================================== --- branches/apertium-tagger/apertium2/apertium/apertium-validate-transfer.1 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/apertium-validate-transfer.1 (revision 69632) @@ -0,0 +1,22 @@ +.TH apertium-validate-transfer 1 2006-03-21 "" "" +.SH NAME +apertium-validate-transfer \- This application is part of +( +.B apertium +) +.PP +This tool is part of the apertium open-source machine translation toolbox: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B apertium-validate-transfer +.SH DESCRIPTION +.BR apertium-validate-transfer +is a script that validates a set of structural transfer rules against +the apertium structural transfer rules DTD using the xmllint utility. 
+ +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante. +This is free software. You may redistribute copies of it under the terms +of the GNU General Public License . + Index: branches/apertium-tagger/apertium2/apertium/deformat-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/deformat-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/deformat-header.sh (revision 69632) @@ -0,0 +1,51 @@ +if [ $# != 2 ] +then if [ $# != 3 ] + then echo "USAGE: $(basename $0) -[aAmM] "; + echo " -a: apertium standard mode"; + echo " -A: apertium optimized mode (default mode)"; + echo " -m: matxin standard mode"; + echo " -M: matxin optimized mode"; + exit 1; + elif [ $1 != "-a" ] && [ $1 != "-A" ] && [ $1 != "-m" ] && [ $1 != "-M" ] + then echo "USAGE: $(basename $0) -[AaMm] "; + echo " -a: apertium standard mode"; + echo " -A: apertium optimized mode (default mode)"; + echo " -m: matxin standard mode"; + echo " -M: matxin optimized mode"; + exit 1; + fi +fi + +FLEXOPTS="" +FILE1=$1; +FILE2=$2; + +if [ $# = 2 ] +then if [ ! -e $1 ] + then echo "ERROR: '$1' file not found"; + exit 1; + fi +fi + +MODE="apertium" # default mode + +if [ $# = 3 ] +then if [ ! 
-e $2 ] + then echo "ERROR: '$2' file not found"; + exit 1; + fi + + if [ $1 = "-a" ] + then FLEXOPTS=""; + MODE="apertium"; + elif [ $1 = "-m" ] + then FLEXOPTS=""; + MODE="matxin"; + elif [ $1 = "-M" ] + then FLEXOPTS="-Cfer"; + MODE="matxin"; + fi + + FILE1=$2; + FILE2=$3; +fi Index: branches/apertium-tagger/apertium2/apertium/gen-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/gen-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/gen-header.sh (revision 69632) @@ -0,0 +1,30 @@ +if [ $# != 2 ] +then if [ $# != 3 ] + then echo "USAGE: $(basename $0) [-O] "; + exit 1; + elif [ $1 != "-O" ] + then echo "USAGE: $(basename $0) [-O] "; + exit 1; + fi +fi + +FLEXOPTS="" +FILE1=$1; +FILE2=$2; + +if [ $# = 2 ] +then if [ ! -e $1 ] + then echo "ERROR: '$1' file not found"; + exit 1; + fi +fi + +if [ $# = 3 ] +then if [ ! -e $2 ] + then echo "ERROR: '$2' file not found"; + exit 1; + fi + FLEXOPTS="-Cfer"; + FILE1=$2; + FILE2=$3; +fi Index: branches/apertium-tagger/apertium2/apertium/gen-stopwords-lextor.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/gen-stopwords-lextor.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/gen-stopwords-lextor.sh (revision 69632) @@ -0,0 +1,45 @@ + +if [ $# != 3 ] +then echo "USAGE: $(basename $0) " 1>&2 + echo "where is the desired number of stopwords" 1>&2 + echo " contains a large preprocessed corpus" 1>&2 + echo " is the file to which the list of stopwords is written" 1>&2 + exit 1 +fi + +N=$1 +INFILE=$2 +OUTFILE=$3 + +if [ ! 
-e $INFILE ] +then echo "ERROR: '$INFILE' file not found" 1>&2 + exit 1 +fi + +cat $INFILE |\ +sed -re "s/(\^[0-9·ÀÁÂÄÇÈÉÊËÌÍÎÏŃÒÓÔÖÙÚÛÜàáâäçèéêëìíîïńòóôöùúûüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz <>ç+.,;:_'#*%()?ż!Ą-]+\\$)/\1\n/g" |\ +sed -re "s/^[ \t]+//g" |\ +sed -re "s/[ \t]+$//g" |\ +sed -re "s/^\^//g" |\ +sed -re "s/\\\$$//g" |\ +awk '{if (length($0)>0) print tolower($0)}' |\ +awk '{ #Only lemma and first tag; rest of tags, if present, are ignored + if (index($0,">")>0) + print substr($0,1,index($0,">")); + else + print $0; +}' |\ +sort | uniq -c | sort -n -r |\ +head -n $N |\ +awk 'BEGIN{FS=" "} +{ + c=""; + for(i=2; i<=NF; i++) { + if (length(c)>0) + c= c " " + c = c $i + } + print c; +}' > $OUTFILE + +exit 0 Index: branches/apertium-tagger/apertium2/apertium/gen-wlist-lextor-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/gen-wlist-lextor-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/gen-wlist-lextor-header.sh (revision 69632) @@ -0,0 +1,25 @@ + +if [ $# != 2 ] +then echo "USAGE: $(basename $0) " 1>&2 + echo "where is a lextor monolingual dictionary (.dix) file" 1>&2 + echo "generated with apertium-gen-lextormono" 1>&2 + exit 1 +fi + +if [ ! 
-e $1 ] +then echo "ERROR: '$1' file not found" 1>&2 + exit 1 +fi + + +$LTTOOLBOX_PATH/lt-expand $1 | grep -v "__REGEXP__" |\ +awk 'BEGIN{FS=":"}{if(index($2,"__")>0) print $1}' |\ +sort | uniq > $2 #|\ +#awk '{ #Only lemma and first tag; rest of tags, if present, are ignored +# if (index($0,">")>0) +# print substr($0,1,index($0,">")); +# else +# print $0; +#}' > $2 + +exit 0 Index: branches/apertium-tagger/apertium2/apertium/preprocess-corpus-lextor.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/preprocess-corpus-lextor.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/preprocess-corpus-lextor.sh (revision 69632) @@ -0,0 +1,54 @@ + +if [ $# != 4 ] +then echo "USAGE: $(basename $0) " 1>&2 + echo "where is the path to the linguistic data to use" 1>&2 + echo " is the translation direction to use" 1>&2 + echo " contains a large corpus in raw format" 1>&2 + echo " is the file to which the preprocessed corpus is written" 1>&2 + exit 1 +fi + +DATA_DIR=$1 +TRANSLATION_DIR=$2 +INFILE=$3 +OUTFILE=$4 + +if [ ! -e $INFILE ] +then echo "ERROR: '$INFILE' file not found" 1>&2 + exit 1 +fi + +if [ ! -e $DATA_DIR/$TRANSLATION_DIR.automorf.bin ] +then echo "ERROR: '$DATA_DIR/$TRANSLATION_DIR.automorf.bin' file not found" 1>&2 + exit 1 +fi + +if [ ! 
-e $DATA_DIR/$TRANSLATION_DIR.prob ] +then echo "ERROR: '$DATA_DIR/$TRANSLATION_DIR.prob' file not found" 1>&2 + exit 1 +fi + + +cat $INFILE | $APERTIUM_PATH/apertium-destxt |\ +$LTTOOLBOX_PATH/lt-proc -a $DATA_DIR/$TRANSLATION_DIR.automorf.bin |\ +$APERTIUM_PATH/apertium-tagger -g $DATA_DIR/$TRANSLATION_DIR.prob |\ +$APERTIUM_PATH/apertium-pretransfer |\ +$APERTIUM_PATH/apertium-retxt |\ +awk 'BEGIN{FS="\\$"} #Discards characters not belonging to apertium words +{ + c=""; + for (j=1; j<=NF; j++) { + w=$j; + w=substr(w,index(w,"^")); + + if ((length(w)>0) && (index(w,"^")>0)) { + if (length(c)>0) + c = c " "; + c = c w "$"; + } + } + + print c; +}' > $OUTFILE + +exit 0 Index: branches/apertium-tagger/apertium2/apertium/script_header.sh.cmake_in =================================================================== --- branches/apertium-tagger/apertium2/apertium/script_header.sh.cmake_in (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/script_header.sh.cmake_in (revision 69632) @@ -0,0 +1,7 @@ +#!${BASH} + +export APERTIUM_PATH=${APERTIUM_PATH} +export LTTOOLBOX_PATH=${LTTOOLBOX_PATH} +export DEFAULT_DIRECTORY=${DEFAULT_DIRECTORY} + +${SCRIPT} Property changes on: branches/apertium-tagger/apertium2/apertium/script_header.sh.cmake_in ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/apertium-tagger/apertium2/apertium/trans-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/trans-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/trans-header.sh (revision 69632) @@ -0,0 +1,107 @@ +case $# in + 2) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=txt + ;; + 3) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=$3 + ;; + 4) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=$3 + FICHERO=$4 + ;; + 5) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=$3 + FICHERO=$4 + SALIDA=$5 + ;; + *) + echo "USAGE: $(basename $0) 
[format [infile [outfile]]]" + echo " datadir Directory of linguistic data" + echo " translation LANG1-LANG2" + echo " format one of: txt (default), txtu, html, htmlu, rtf, rtfu" + echo " infile input file (stdin by default)" + echo " outfile output file (stdout by default)" + exit 1; +esac + +#Parámetros obligatorios +PREFIJO=$2 #Dirección traducción Ejm.- es-ca +FORMATADOR=$3 #Fuente a traducir Ejm.- txt + +DATOS=$1 + +#Parametro opcional, de no estar, lee de la entrada estandar (stdin) +FICHERO=$4 #Fichero con el texto a traducir + +PATH=.:/usr/local/bin:$PATH +AUTOMORF=$DATOS/$PREFIJO.automorf.bin +AUTOBIL=$DATOS/$PREFIJO.autobil.bin +#AUTOBIL=$DATOS/$PREFIJO.lextorbil.bin +AUTOGEN=$DATOS/$PREFIJO.autogen.bin +AUTOPGEN=$DATOS/$PREFIJO.autopgen.bin + +DEP="dep" + +TURL="cat" #No hace nada, se introduce para no tener + #que cambiar la línea de montaje, pues en algunos + #casos se usa como ultimo eslabón de la cadena el + #programa turl o ext-turl. +REF= + +case "$FORMATADOR" in + txt) + FORMATADOR="txt" + GENERADOR="lt-proc -g" + ;; + txtu) + FORMATADOR="txt" + GENERADOR="lt-proc -n" + ;; + rtf) + FORMATADOR="rtf" + GENERADOR="lt-proc -g" + ;; + rtfu) + FORMATADOR="rtf" + GENERADOR="lt-proc -n" + ;; + html) + FORMATADOR="html" + GENERADOR="lt-proc -g" + ;; + htmlu) + FORMATADOR="html" + GENERADOR="lt-proc -n" + ;; + *) # Por defecto asumimos txt + FORMATADOR="txt" + GENERADOR="lt-proc -g" + ;; +esac + +if [ -z $REF ] +then + REF=$FORMATADOR +fi + +$APERTIUM_PATH/apertium-des$FORMATADOR $FICHERO | \ +$LTTOOLBOX_PATH/lt-proc $AUTOMORF | \ +$APERTIUM_PATH/apertium-tagger -g $DATOS/$PREFIJO.prob | \ +$APERTIUM_PATH/apertium-pretransfer | \ +#$APERTIUM_PATH/apertium-lextor -l $DATOS/$PREFIJO.lextor $DATOS/$PREFIJO.lextormono.bin 3 3 | \ +$APERTIUM_PATH/apertium-transfer $DATOS/trules-$PREFIJO.xml $DATOS/trules-$PREFIJO.bin $AUTOBIL | \ +$LTTOOLBOX_PATH/$GENERADOR $AUTOGEN | \ +$LTTOOLBOX_PATH/lt-proc -p $AUTOPGEN | \ +if [ x$SALIDA = x ] +then 
$APERTIUM_PATH/apertium-re$FORMATADOR +else + $APERTIUM_PATH/apertium-re$FORMATADOR >$SALIDA +fi Index: branches/apertium-tagger/apertium2/apertium/trans-lextor-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/trans-lextor-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/trans-lextor-header.sh (revision 69632) @@ -0,0 +1,107 @@ +case $# in + 2) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=txt + ;; + 3) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=$3 + ;; + 4) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=$3 + FICHERO=$4 + ;; + 5) + DATOS=$1 + PREFIJO=$2 + FORMATADOR=$3 + FICHERO=$4 + SALIDA=$5 + ;; + *) + echo "USAGE: $(basename $0) [format [infile [outfile]]]" + echo " datadir Directory of linguistic data" + echo " translation LANG1-LANG2" + echo " format one of: txt (default), txtu, html, htmlu, rtf, rtfu" + echo " infile input file (stdin by default)" + echo " outfile output file (stdout by default)" + exit 1; +esac + +#Parámetros obligatorios +PREFIJO=$2 #Dirección traducción Ejm.- es-ca +FORMATADOR=$3 #Fuente a traducir Ejm.- txt + +DATOS=$1 + +#Parametro opcional, de no estar, lee de la entrada estandar (stdin) +FICHERO=$4 #Fichero con el texto a traducir + +PATH=.:/usr/local/bin:$PATH +AUTOMORF=$DATOS/$PREFIJO.automorf.bin +#AUTOBIL=$DATOS/$PREFIJO.autobil.bin +AUTOBIL=$DATOS/$PREFIJO.lextorbil.bin +AUTOGEN=$DATOS/$PREFIJO.autogen.bin +AUTOPGEN=$DATOS/$PREFIJO.autopgen.bin + +DEP="dep" + +TURL="cat" #No hace nada, se introduce para no tener + #que cambiar la línea de montaje, pues en algunos + #casos se usa como ultimo eslabón de la cadena el + #programa turl o ext-turl. 
+REF= + +case "$FORMATADOR" in + txt) + FORMATADOR="txt" + GENERADOR="lt-proc -g" + ;; + txtu) + FORMATADOR="txt" + GENERADOR="lt-proc -n" + ;; + rtf) + FORMATADOR="rtf" + GENERADOR="lt-proc -g" + ;; + rtfu) + FORMATADOR="rtf" + GENERADOR="lt-proc -n" + ;; + html) + FORMATADOR="html" + GENERADOR="lt-proc -g" + ;; + htmlu) + FORMATADOR="html" + GENERADOR="lt-proc -n" + ;; + *) # Por defecto asumimos txt + FORMATADOR="txt" + GENERADOR="lt-proc -g" + ;; +esac + +if [ -z $REF ] +then + REF=$FORMATADOR +fi + +$APERTIUM_PATH/apertium-des$FORMATADOR $FICHERO | \ +$LTTOOLBOX_PATH/lt-proc $AUTOMORF | \ +$APERTIUM_PATH/apertium-tagger -g $DATOS/$PREFIJO.prob | \ +$APERTIUM_PATH/apertium-pretransfer | \ +$APERTIUM_PATH/apertium-lextor -l $DATOS/$PREFIJO.lextor $DATOS/$PREFIJO.lextormono.bin 3 3 | \ +$APERTIUM_PATH/apertium-transfer $DATOS/trules-$PREFIJO.xml $DATOS/trules-$PREFIJO.bin $AUTOBIL | \ +$LTTOOLBOX_PATH/$GENERADOR $AUTOGEN | \ +$LTTOOLBOX_PATH/lt-proc -p $AUTOPGEN | \ +if [ x$SALIDA = x ] +then $APERTIUM_PATH/apertium-re$FORMATADOR +else + $APERTIUM_PATH/apertium-re$FORMATADOR >$SALIDA +fi Index: branches/apertium-tagger/apertium2/apertium/transformdic-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/transformdic-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transformdic-header.sh (revision 69632) @@ -0,0 +1,20 @@ +if [ $# != 3 ] +then echo "USAGE: $(basename $0) lr|rl "; + exit 1; +fi + +FILE1=$2; +FILE2=$3; + +if [ ! 
-e $2 ] +then echo "ERROR: '$2' file not found"; # report the input file ($2), not the lr|rl flag ($1) + exit 1; +fi + +if [ $1 = "lr" ] +then xsltproc $XSLTPROC_OPTIONS_LR $STYLESHEET $FILE1 >$FILE2 +elif [ $1 = "rl" ] +then xsltproc $XSLTPROC_OPTIONS_RL $STYLESHEET $FILE1 >$FILE2 +else + echo "ERROR: $1 option invalid"; +fi Index: branches/apertium-tagger/apertium2/apertium/transformdicbil-header.sh =================================================================== --- branches/apertium-tagger/apertium2/apertium/transformdicbil-header.sh (nonexistent) +++ branches/apertium-tagger/apertium2/apertium/transformdicbil-header.sh (revision 69632) @@ -0,0 +1,14 @@ +if [ $# != 2 ] +then echo "USAGE: $(basename $0) "; + exit 1; +fi + +FILE1=$1; +FILE2=$2; + +if [ ! -e $1 ] +then echo "ERROR: '$1' file not found"; + exit 1; +fi + +xsltproc $XSLTPROC_OPTIONS $STYLESHEET $FILE1 >$FILE2 Index: branches/apertium-tagger/apertium2/apertium =================================================================== --- branches/apertium-tagger/apertium2/apertium (nonexistent) +++ branches/apertium-tagger/apertium2/apertium (revision 69632) Property changes on: branches/apertium-tagger/apertium2/apertium ___________________________________________________________________ Added: svn:ignore ## -0,0 +1,43 ## +.deps +.libs +Makefile +Makefile.in +stamp-* +apertium +apertium-[dfgilprtv]*[!1] +apertium_config.h +apertium_config.h.in +apertium_deshtml.cc +apertium_deslatex.cc +apertium_desmediawiki.cc +apertium_desodt.cc +apertium_despptx.cc +apertium_desrtf.cc +apertium_destxt.cc +apertium_deswxml.cc +apertium_desxlsx.cc +apertium_desxpresstag.cc +apertium-multiple-translations +apertium_postlatex.cc +apertium_postlatex_raw.cc +apertium_prelatex.cc +apertium_rehtml.cc +apertium_rehtml_noent.cc +apertium_relatex.cc +apertium_remediawiki.cc +apertium_reodt.cc +apertium_repptx.cc +apertium_rertf.cc +apertium_retxt.cc +apertium_rewxml.cc +apertium_rexlsx.cc +apertium_rexpresstag.cc +apertium-unformat +apertium-utils-fixlatex +dix.rnc +format.rnc 
+interchunk.rnc +modes.rnc +postchunk.rnc +tagger.rnc +transfer.rnc Index: branches/apertium-tagger/apertium2/tests/tagger/__init__.py =================================================================== --- branches/apertium-tagger/apertium2/tests/tagger/__init__.py (nonexistent) +++ branches/apertium-tagger/apertium2/tests/tagger/__init__.py (revision 69632) @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import functools +import unittest +import tempfile +from os.path import join as pjoin +from os.path import abspath, dirname +from subprocess import (check_call, check_output, Popen, PIPE, DEVNULL, + TimeoutExpired, CalledProcessError) + + +# Utilities +def tmp(contents): + t = tempfile.NamedTemporaryFile(mode='w', delete=False) + t.write(contents) + return t.name + + +def rel(fn): + return abspath(pjoin(dirname(abspath(__file__)), fn)) + + +APERTIUM_TAGGER = rel("../../apertium/apertium-tagger") + + +def check_stderr(*popenargs, timeout=None, **kwargs): + # Essentially a copypasted version of check_output. + # Can be significantly abridged with Python 3.5's run(...) 
+ if 'stderr' in kwargs: + raise ValueError('stderr argument not allowed, it will be overridden.') + if 'input' in kwargs: + if 'stdin' in kwargs: + raise ValueError('stdin and input arguments may not both be used.') + inputdata = kwargs['input'] + del kwargs['input'] + kwargs['stdin'] = PIPE + else: + inputdata = None + with Popen(*popenargs, stderr=PIPE, **kwargs) as process: + try: + unused_output, err = process.communicate(inputdata, + timeout=timeout) + except TimeoutExpired: + process.kill() + unused_output, err = process.communicate() + raise TimeoutExpired(process.args, timeout, output=err) + except: + process.kill() + process.wait() + raise + retcode = process.poll() + if retcode: + raise CalledProcessError(retcode, process.args, output=err) + return err + + +def trace_dec(f): + @functools.wraps(f) + def inner(*args, **kwargs): + if len(args) > 0: + print("run " + " ".join(args[0])) + return f(*args, **kwargs) + return inner + + +def trace_plus_unicode(f): + return functools.partial(trace_dec(f), universal_newlines=True) + +check_call = trace_plus_unicode(check_call) +check_output = trace_plus_unicode(check_output) +check_stderr = trace_plus_unicode(check_stderr) + +# Test files +DIC = """ +^the/the$ +^books/book/book$ +^has/have$ +^booked/book/book$ +^close/close/close/close/close/close$ +^cat/cat$ +^room/room$ +^red/red$ +^./.$ +""".strip() + +TSX = """ + + + + + + + + + + + + + + + + + + + + +""".strip() + +TRAIN_NO_PROBLEM_UNTAGGED = """ +^The/the$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book/book$ +^the/the$ +^red/red$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TRAIN_NO_PROBLEM_TAGGED = """ +^The/the$ +^cat/cat$ +^books/book$ +^the/the$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book$ +^the/the$ +^red/red$ +^room/room$ +^./.$ + +^The/the$ +^red/red$ +^cat/cat$ +^books/book$ +^the/the$ 
+^room/room$ +^./.$ +""".strip() + +TRAIN_CAT_TO_BE_A_VERB_UNTAGGED = """ +^The/The$ +^falling/fall/fall/fall$ +^cat/cat$ +^has/have$ +^booked/book/book$ +^books/book/book$ +^./.$ + +^Close/close/close/close/close/close$ +^the/the$ +^books/book/book$ +^./.$ + +^The/the$ +^falling/fall/fall/fall$ +^cat/cat$ +^has/have$ +^books/book/book$ +^./.$ +""".strip() + +TRAIN_CAT_TO_BE_A_VERB_TAGGED = """ +^The/The$ +^falling/fall$ +^cat/cat$ +^has/have$ +^booked/book$ +^books/book$ +^./.$ + +^Close/close$ +^the/the$ +^books/book$ +^./.$ + +^The/the$ +^falling/fall$ +^cat/cat$ +^has/have$ +^books/book$ +^./.$ +""".strip() + +TEST_SUCCESS = """ +^The/the$ +^cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +TEST_NEW_AMBG_CLASS = """ +^The/the$ +^cat/cat/cat$ +^books/book/book$ +^the/the$ +^room/room$ +^./.$ +""".strip() + +# Expected strings +EXPECTED_SUBST = """ +Error: A new ambiguity class was found. +Retraining the tagger is necessary so as to take it into account. +Word 'cat'. 
+New ambiguity class: {NOUN,ADJ} +""".strip().split("\n") + + +# Tests +class AmbiguityClassTest(unittest.TestCase): + def setUp(self): + self.tsx_fn = tmp(TSX) + self.dic_fn = tmp(DIC) + + def changing_class_impl(self, flags, model_fn): + test1 = tmp(TEST_SUCCESS) + test2 = tmp(TEST_NEW_AMBG_CLASS) + success_stderr = check_stderr( + [APERTIUM_TAGGER, '-d'] + flags + + ['-g', model_fn, test1], + stdout=DEVNULL) + self.assertEqual(success_stderr.strip(), "") + subst_stderr = check_stderr( + [APERTIUM_TAGGER, '-d'] + flags + + ['-g', model_fn, test2], + stdout=DEVNULL) + subst_stderr = [line.strip() + for line in subst_stderr.strip().split("\n")] + self.assertEqual(subst_stderr, EXPECTED_SUBST) + ambg_class = check_output( + [rel('test-find-similar-ambiguity-class'), model_fn], + input="NOUN ADJ\n") + substituted_class = set(ambg_class.split(" ")) + # Should get open class + self.assertSetEqual(substituted_class, set(("VERB", "NOUN", "ADJ"))) + + def test_changing_class_hmm_sup(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + tagged = tmp(TRAIN_NO_PROBLEM_TAGGED) + check_call( + [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, + model_fn, tagged, untagged]) + self.changing_class_impl([], model_fn) + + def test_changing_class_hmm_unsup(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '-t', '1', self.dic_fn, untagged, self.tsx_fn, + model_fn]) + self.changing_class_impl([], model_fn) + + def test_changing_class_sliding_window(self): + model_fn = tmp("") + untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) + check_call( + [APERTIUM_TAGGER, '--sliding-window', '-t', '1', self.dic_fn, + untagged, self.tsx_fn, model_fn]) + self.changing_class_impl(['--sliding-window'], model_fn) + + def test_cat_is_a_verb(self): + model_fn = tmp("") + untagged = tmp(TRAIN_CAT_TO_BE_A_VERB_UNTAGGED) + tagged = tmp(TRAIN_CAT_TO_BE_A_VERB_TAGGED) + new_ambg_class = tmp(TEST_NEW_AMBG_CLASS) + check_call( + 
[APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, + model_fn, tagged, untagged]) + subst_stdout = check_output( + [APERTIUM_TAGGER, '-d', '-g', model_fn, new_ambg_class], + stderr=DEVNULL) + acceptable = False + for line in subst_stdout.split("\n"): + if (line.startswith('^cat') and ('' in line or '' in line)): + acceptable = True + self.assertTrue( + acceptable, + "'cat' must be output and tagged as an adjective or a noun.\n" + + "Actual output:\n{}".format(subst_stdout)) Property changes on: branches/apertium-tagger/apertium2/tests/tagger/__init__.py ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/apertium-tagger/apertium2/tests/tagger/Makefile.am =================================================================== --- branches/apertium-tagger/apertium2/tests/tagger/Makefile.am (nonexistent) +++ branches/apertium-tagger/apertium2/tests/tagger/Makefile.am (revision 69632) @@ -0,0 +1,14 @@ +library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) + +bin_PROGRAMS = test-find-similar-ambiguity-class +bin_SCRIPTS = $(GENERATEDSCRIPTS) + +AM_CPPFLAGS = -I$(top_srcdir) + +apertiumdir = $(prefix)/share/apertium +apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION) +apertiumlib = $(prefix)/lib +apertiumsysconf = $(prefix)/etc/apertium + +test_find_similar_ambiguity_class_SOURCES = test_find_similar_ambiguity_classes.cc +test_find_similar_ambiguity_class_LDADD = -L$(top_srcdir)/$(GENERIC_LIBRARY_NAME)/.libs/ $(APERTIUM_LIBS) -l$(GENERIC_LIBRARY_NAME)$(GENERIC_MAJOR_VERSION) Index: branches/apertium-tagger/apertium2/tests/tagger/test_find_similar_ambiguity_classes.cc =================================================================== --- branches/apertium-tagger/apertium2/tests/tagger/test_find_similar_ambiguity_classes.cc (nonexistent) +++ 
branches/apertium-tagger/apertium2/tests/tagger/test_find_similar_ambiguity_classes.cc (revision 69632) @@ -0,0 +1,61 @@ +#include "apertium/utf_converter.h" +#include "apertium/tagger_utils.h" +#include "apertium/tagger_data_hmm.h" +#include "apertium/tagger_data.h" +#include +#include +#include +#include + +void print_ambiguity_class(const vector &array_tags, const set &abgset) +{ + unsigned int j; + set::const_iterator abgseti; + for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) { + wcout << array_tags[*abgseti]; + if (j < abgset.size() - 1) { + wcout << " "; + } + } +} + +void find_similar_ambiguity_class_io(TaggerData &td) +{ + vector &array_tags = td.getArrayTags(); + wstring line = L""; + getline(wcin, line, L'\n'); + + wstringstream line_stream(line); + set ambiguity_class; + wstring tag_name; + while (line_stream >> tag_name) { + vector::iterator it; + it = find(array_tags.begin(), array_tags.end(), tag_name); + if (it == array_tags.end()) { + wcerr << L"Tag not in model: " << tag_name << L'\n'; + exit(-3); + } + ambiguity_class.insert(it - array_tags.begin()); + } + set similar_ambiguity_class = tagger_utils::find_similar_ambiguity_class(td, ambiguity_class); + print_ambiguity_class(array_tags, similar_ambiguity_class); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + cerr<<"Usage: "<\n"; + exit(-1); + } + char* probfile = argv[1]; + TaggerDataHMM tagger_data_hmm; + FILE* fin = fopen(probfile, "r"); + if (!fin) { + cerr<<"Error: cannot open file '"<$", "^a+c$", "^a# b$", "[

]^a$", "[
]^a# b$"] + expectedOutputs = ["^a$", "^a$ ^c$", "^a# b$", "[
]^a$", "[
]^a# b$"] + +class JoinGroupPretransferTest(PretransferTest): + inputs = ["[
]^a+c# b$", "[
]^a+c+d# b$"] + expectedOutputs = ["[
]^a# b$ ^c$", "[
]^a# b$ ^c$ ^d$"] + + +# Proposed inline blank format: +class InlineBlankPretransferTest(PretransferTest): + inputs = ["[{}]^a+c# b$", "[{}]^a+c+d# b$"] + expectedOutputs = ["[{}]^a# b$ [{}]^c$", "[{}]^a# b$ [{}]^c$ [{}]^d$"] + @unittest.expectedFailure + def runTest(self): + super().runTest(self) Index: branches/apertium-tagger/apertium2/tests/README =================================================================== --- branches/apertium-tagger/apertium2/tests/README (nonexistent) +++ branches/apertium-tagger/apertium2/tests/README (revision 69632) @@ -0,0 +1,7 @@ +Tests require python3, run like + + python3 tests/run_tests.py + +You may have to do "(sudo) make install" once before running the tests. + +They should all pass. Index: branches/apertium-tagger/apertium2/.gitignore =================================================================== --- branches/apertium-tagger/apertium2/.gitignore (nonexistent) +++ branches/apertium-tagger/apertium2/.gitignore (revision 69632) @@ -0,0 +1,88 @@ +*.la +*.lo +*.o +*.pyc + +**.deps/ +**.dirstamp + +# / +/autom4te.cache + +/compile +/config.guess +/config.status +/config.sub +/configure +/depcomp +/install-sh +/libtool +/ltmain.sh +/missing + +/aclocal.m4 +/config.log +/INSTALL +/Makefile +/Makefile.in + +/*.pc + +# /apertium/wildcard +/apertium/.libs + +/apertium/apertium +/apertium/apertium-*[!1] + +!/apertium/apertium-createmodes.awk +!/apertium/apertium-header.sh +!/apertium/apertium-multiple-translations.cc +!/apertium/apertium-unformat-header.sh + + +/apertium/apertium_config.h +/apertium/apertium_config.h.in +/apertium/apertium_config.h.in~ +/apertium/apertium_deshtml.cc +/apertium/apertium_deslatex.cc +/apertium/apertium_desmediawiki.cc +/apertium/apertium_desodt.cc +/apertium/apertium_despptx.cc +/apertium/apertium_desrtf.cc +/apertium/apertium_destxt.cc +/apertium/apertium_deswxml.cc +/apertium/apertium_desxlsx.cc +/apertium/apertium_desxpresstag.cc +/apertium/apertium_postlatex.cc 
+/apertium/apertium_postlatex_raw.cc +/apertium/apertium_prelatex.cc +/apertium/apertium_rehtml.cc +/apertium/apertium_rehtml_noent.cc +/apertium/apertium_relatex.cc +/apertium/apertium_remediawiki.cc +/apertium/apertium_reodt.cc +/apertium/apertium_repptx.cc +/apertium/apertium_rertf.cc +/apertium/apertium_retxt.cc +/apertium/apertium_rewxml.cc +/apertium/apertium_rexlsx.cc +/apertium/apertium_rexpresstag.cc + +/apertium/dix.rnc +/apertium/format.rnc +/apertium/interchunk.rnc +/apertium/Makefile +/apertium/Makefile.in +/apertium/modes.rnc +/apertium/postchunk.rnc +/apertium/tagger.rnc +/apertium/transfer.rnc + +/apertium/stamp-* + +# Tests +/tests/**/Makefile.in +/tests/**/Makefile +/tests/**/.libs + +/tests/tagger/test-find-similar-ambiguity-class Index: branches/apertium-tagger/apertium2/Makefile.am =================================================================== --- branches/apertium-tagger/apertium2/Makefile.am (nonexistent) +++ branches/apertium-tagger/apertium2/Makefile.am (revision 69632) @@ -0,0 +1,19 @@ +SUBDIRS = $(GENERIC_LIBRARY_NAME) tests +DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) tests + +modesdir=$(prefix)/share/apertium/modes + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = apertium.pc + +aclocaldir = $(datadir)/aclocal +aclocal_DATA = apertium.m4 + +EXTRA_DIST=autogen.sh README-MODES apertium.m4 + +install-data-local: + mkdir -p $(DESTDIR)$(modesdir) + $(INSTALL_DATA) README-MODES $(DESTDIR)$(modesdir)/README + +test: tests/run_tests.py + $< Index: branches/apertium-tagger/apertium2/configure.ac =================================================================== --- branches/apertium-tagger/apertium2/configure.ac (nonexistent) +++ branches/apertium-tagger/apertium2/configure.ac (revision 69632) @@ -0,0 +1,189 @@ +# -*- Autoconf -*- +# Process this file with autoconf to produce a configure script. 
+ +AC_PREREQ(2.52) + +m4_define([required_lttoolbox_version], [3.3.3]) +m4_define([required_libxml_version], [2.6.17]) +m4_define([required_libpcre_version], [6.4]) +#m4_define([required_pkg_config_version], [0.15]) + +AC_INIT([apertium], [3.4.2], [sortiz@users.sourceforge.net]) +AC_CONFIG_HEADER([apertium/apertium_config.h]) + +AC_CANONICAL_SYSTEM + +GENERIC_LIBRARY_NAME=apertium + +# Release versioning +GENERIC_MAJOR_VERSION=3 +GENERIC_MINOR_VERSION=4 +GENERIC_MICRO_VERSION=0 + +# API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION) +GENERIC_API_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION +AC_SUBST(GENERIC_API_VERSION) +AC_SUBST(GENERIC_MAJOR_VERSION) + +# Shared library versioning +GENERIC_LIBRARY_VERSION=0:0:0 +# | | | +# +------+ | +---+ +# | | | +# current:revision:age +# | | | +# | | +- increment if interfaces have been added +# | | set to zero if interfaces have been removed +# or changed +# | +- increment if source code has changed +# | set to zero if current is incremented +# +- increment if interfaces have been added, removed or changed + +AC_SUBST(GENERIC_LIBRARY_VERSION) +PACKAGE=$GENERIC_LIBRARY_NAME +AC_SUBST(GENERIC_LIBRARY_NAME) + +GENERIC_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION.$GENERIC_MICRO_VERSION +GENERIC_RELEASE=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION +AC_SUBST(GENERIC_RELEASE) +AC_SUBST(GENERIC_VERSION) + +VERSION=$GENERIC_VERSION + +AM_INIT_AUTOMAKE(no-define) + +AC_PROG_CXX +AC_PROG_LIBTOOL +AM_SANITY_CHECK +AC_LANG_CPLUSPLUS + +CFLAGS="-Wall -Wextra $CFLAGS" +CXXFLAGS="-Wall -Wextra $CXXFLAGS" + +AC_ARG_ENABLE(debug, + [ --enable-debug Enable "-g -Wall" compiler options], + [CXXFLAGS="-g -Wall"; CFLAGS="-g -Wall"; AC_DEFINE([ENABLE_DEBUG], [1], [ENABLE_DEBUG])]) + +AC_ARG_ENABLE(profile, + [ --enable-profile Enable "-pg -g -Wall" compiler options], + [CXXFLAGS="-pg -g -Wall"; CFLAGS="-pg -g -Wall"; LDFLAGS="-pg"]) + + +AC_PATH_PROG(XMLLINT, xmllint, no) +if test x$ac_cv_path_XMLLINT = x +then + 
AC_MSG_ERROR([You don't have xmllint installed.]) +fi +if test x$ac_cv_path_XMLLINT = xno +then + AC_MSG_ERROR([You don't have xmllint installed.]) +fi + + AC_PATH_PROG(XSLTPROC, xsltproc, no) + if test x$ac_cv_path_XSLTPROC = x + then + AC_MSG_ERROR([You don't have xsltproc installed.]) + fi + if test x$ac_cv_path_XSLTPROC = xno + then + AC_MSG_ERROR([You don't have xsltproc installed.]) + fi + +AC_PATH_PROG(BASH, bash, no) +if test x$ac_cv_path_BASH = x +then + AC_MSG_ERROR([You don't have bash installed.]) +fi +if test x$ac_cv_path_BASH = xno +then + AC_MSG_ERROR([You don't have bash installed.]) +fi + +AC_PATH_PROG(FLEX, flex, no) +if test x$ac_cv_path_FLEX = x +then + AC_MSG_ERROR([You don't have flex installed.]) +fi +if test x$ac_cv_path_FLEX = xno +then + AC_MSG_ERROR([You don't have flex installed.]) +fi + +AC_PATH_PROG(PKG_CONFIG, pkg-config, no) +if test x$ac_cv_path_PKG_CONFIG = x +then + AC_MSG_ERROR([You don't have pkg-config installed.]) +fi +if test x$ac_cv_path_PKG_CONFIG = xno +then + AC_MSG_ERROR([You don't have pkg-config installed.]) +fi + +AC_CHECK_FUNCS(strcasecmp) + +if test x$(uname) != xDarwin; +then +AC_CHECK_HEADER(pcreposix.h, + AC_CHECK_LIB(pcre, pcre_fullinfo,[ + LIBS="$LIBS -lpcreposix -lpcre" + no_comp_check=yes], + AC_MSG_ERROR([*** unable to locate pcre library ***])), + AC_MSG_ERROR([*** unable to locate pcreposix.h include file ***])) + +AC_CHECK_HEADER(pcrecpp.h, + AC_CHECK_LIB(pcrecpp,pcre_compile,[ + LIBS="$LIBS -lpcrecpp" + no_comp_check=yes], + AC_MSG_ERROR([*** unable to locate pcrecpp library ***])), + AC_MSG_ERROR([*** unable to locate pcrecpp.h include file ***])) +fi + + +PKG_CHECK_MODULES(APERTIUM, [ + lttoolbox >= required_lttoolbox_version + libxml-2.0 >= required_libxml_version + libpcre >= required_libpcre_version], CPPFLAGS="$CPPFLAGS $APERTIUM_CFLAGS"; LIBS="$LIBS $APERTIUM_LIBS") + +# Check for wide strings +AC_DEFUN([AC_CXX_WSTRING],[ + AC_CACHE_CHECK(whether the compiler supports wide strings, + 
ac_cv_cxx_wstring, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]],[[ +std::wstring test = L"test"; + ]])], + [ac_cv_cxx_wstring=yes], [ac_cv_cxx_wstring=no]) + AC_LANG_RESTORE + ]) +]) + +AC_CXX_WSTRING +AC_C_BIGENDIAN + +if test "$ac_cv_cxx_wstring" = no +then + AC_MSG_ERROR([Missing wide string support]) +fi + + +# Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_HEADER_STDBOOL +AC_C_CONST +AC_TYPE_SIZE_T + +# Checks for library functions. +AC_FUNC_ERROR_AT_LINE + +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt, getopt_long, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked]) +AC_CHECK_FUNCS([setlocale strdup getopt snprintf mbtowc]) +AC_REPLACE_FUNCS(getopt_long) + +AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows]) +AS_IF([test x$version_type = xwindows], [AC_DEFINE(HAVE_GETOPT_LONG,0)], []) + +AC_OUTPUT([Makefile apertium.pc apertium/Makefile tests/Makefile tests/tagger/Makefile]) Index: branches/apertium-tagger/apertium2/Jenkinsfile =================================================================== --- branches/apertium-tagger/apertium2/Jenkinsfile (nonexistent) +++ branches/apertium-tagger/apertium2/Jenkinsfile (revision 69632) @@ -0,0 +1,10 @@ +node { + stage 'Checkout' + checkout scm + + stage 'Build' + sh "./autogen.sh && make clean && make" + + stage 'Test' + sh "make test" +} Index: branches/apertium-tagger/apertium2/NEWS =================================================================== --- branches/apertium-tagger/apertium2/NEWS (nonexistent) +++ branches/apertium-tagger/apertium2/NEWS (revision 69632) @@ -0,0 +1,128 @@ + +=================== + NEWS for apertium +=================== + +SVN +--- + +Version 3.4.2, 2016-05-15 (-r68437) +--------------------------------- + +* some bugfixes to apertium-tagger, e.g. 
+ https://sourceforge.net/p/apertium/tickets/94/ + +* bugfixes to modes: now accept dirs with spaces, and allow installing apertium + itself and language data to different prefixes, as well as auto-generating + debug modes + +* fix a crash when apertium-tagger is compiled with clang + +* new option -n to deformatters turns off dot-insertion + http://sourceforge.net/p/apertium/tickets/68 + +* new transfer instruction <reject-current-rule shifting="no"/>; + see transfer.dtd for details (not implemented for + interchunk/postchunk) + +* apertium-transfer-tools-generalisation-dev branch merged; outputs + extra trace information from transfer for generalising + corpus-generated transfer rules + +* apertium-tagger: supervised training and tagging for unigram models + based on http://coltekin.net/cagri/papers/trmorph-tools.pdf + +* fix some off-by-one/out-of-bounds segfaults in transfer + https://sourceforge.net/p/apertium/tickets/89/ + +* various distribution-related fixes, static analysis fixes, + documentation + +Version 3.4.0, 2015-03-17 (-r59200) +--------------------------------- + +* transfer files now work even if they were compiled with a different + version of pcre + +* more explicit validation checks on .dix compilation + +* various fixes to driver script: + + * `apertium -d . 
-l` behaves as expected now + + * `>>` no longer empties out the destination file + + * safer variable quoting + +* some Windows Unicode fixes + +* tagger now resets its state after a flush + + +Version 3.3, 2014-08-20 (-r56825) +--------------------------------- + +* new Light Sliding Window Part-of-Speech Tagger (GsoC project merged) + +* new LaTeX format handler + +* new html-noent format handler (html without turning non-ASCII into entities) + +* bilingual lookup can now be separate from transfer + + * see new -b options to lt-proc/apertium-transfer + +* apertium.m4 now available for language pairs to simplify build rules + and depend on monolingual data + +* some memory leaks and many minor bugs fixed + + * pretransfer now allows '+' inside tags + +* --trace modes for transfer + + +Version 3.2, 2010-09-21 (-r25741) +--------------------------------- + +* Fixed some bugs in pretransfer, allow '+' inside tags + +* Updated the DTDs to allow comments anywhere + + +Version 3.1, 2008-09-29 +----------------------- + + +Version 3.0, 2008-08-01 +----------------------- + +* Debian package + + +Version 2.0, 2007-06-19 +----------------------- + + +Version 1.9, 2006-12-15 +----------------------- + + +Version 1.0, 2006-10-02 +----------------------- + +* Debian package + + +Version 0.9, 2005-09-29 +----------------------- + + +Version 0.8, 2005-08-01 +----------------------- + + + +# Local Variables: +# mode: markdown +# End: Index: branches/apertium-tagger/apertium2/apertium.m4 =================================================================== --- branches/apertium-tagger/apertium2/apertium.m4 (nonexistent) +++ branches/apertium-tagger/apertium2/apertium.m4 (revision 69632) @@ -0,0 +1,155 @@ +# apertium.m4 - Macros to locate and utilise apertium libraries -*- Autoconf -*- +# serial 1 (apertium-3.4.2) +# +# Copyright (C) 2013--2016 Universitat d'Alacant / Universidad de Alicante +# +# This program is free software; you can redistribute it and/or +# modify it under the 
terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. + + +# AP_CHECK_LING([ID], [MONOLINGUAL_PACKAGE]) +# +# Check to see whether MONOLINGUAL_PACKAGE exists, and if so sets +# AP_LIB[ID] and AP_SRC[ID]. +# +# As an example, AP_CHECK_LING([1], [apertium-fie]) would check that +# apertium-fie exists, and set AP_LIB1 and AP_SRC1 to the paths +# containing the binaries and sources respectively of that monolingual +# language package. +# +# Also sets up options --with-lang[ID] (e.g. --with-lang1) if the user +# wants to use the source code checkout instead of installed files. 
+# ------------------------------------------ +AC_DEFUN([AP_CHECK_LING], +[ + AC_ARG_VAR([AP_SRC][$1], [Path to $2 sources, same as AP_LIB$1 if --with-lang$1 set]) + AC_ARG_VAR([AP_LIB][$1], [Path to $2 binaries, same as AP_SRC$1 if --with-lang$1 set]) + AC_ARG_VAR([AP_SUBDIRS], [List of all --with-lang dirs; add it to SUBDIRS to make configure-specified dependencies recursively]) + AC_ARG_WITH([lang][$1], + [dnl +AS_HELP_STRING([--with-lang][$1],dnl +[Uninstalled source directory for $2, defines AP_SRC$1 and AP_LIB$1 for Makefile, otherwise these are set to paths of installed files.]) + ], + [ + AP_LIB$1=$withval + AP_SRC$1=$withval + echo "Using $2 from $withval" + AP_SUBDIRS="$AP_SUBDIRS $withval" + ], + [ + # TODO: PKG_CHECK_MODULES sets useless variables, while _EXISTS + # doesn't error if not found, should make a PKG_CHECK macro that + # errors but does not set _CFLAGS/_LIBS + PKG_CHECK_MODULES(m4_toupper(m4_bpatsubst($2, [-], [_])), [$2]) + AP_LIB$1=`pkg-config --variable=dir $2` + AP_SRC$1=`pkg-config --variable=srcdir $2` + ]) + if test -z "$AP_SRC$1" || ! test -d "$AP_SRC$1"; then + AC_MSG_ERROR([Could not find sources dir for $2 (AP_SRC$1="$AP_SRC$1")]) + fi +]) + + +# AP_MKINCLUDE() +# +# Creates the file ap_include.am and sets the variable ap_include to +# point to this path. Now in your Makefile.am you can include +# ap_include.am by writing @ap_include@ on a line by itself. +# +# The file defines a pattern rule for making modes files, and a goal +# for installing the ones that have install="yes" in modes.xml. To +# generate modes, include a line like +# +# noinst_DATA=modes/$(PREFIX1).mode +# +# in your Makefile.am with _at most one mode_ (the others will be +# created even if you list only one, listing several will lead to +# trouble with parallel make). +# +# Install the modes by making install-data-local dependent on +# install-modes, i.e. +# +# install-data-local: install-modes +# +# Also defined is a goal for making the .deps folder. 
If you want some +# file to be built in a folder named .deps, just make that goal +# dependent on .deps/.d, e.g. +# +# .deps/intermediate.dix: original.dix .deps/.d +# +# ------------------------------------------ +AC_DEFUN([AP_MKINCLUDE], +[ + AC_SUBST_FILE(ap_include) + ap_include=$srcdir/ap_include.am + + cat >$srcdir/ap_include.am <@/@name' modes.xml | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ + if test -n "\$\$modes"; then mv \$\$modes modes/; fi + +apertium_modesdir=\$(prefix)/share/apertium/modes/ +install-modes: + mv modes modes.bak + apertium-gen-modes -f modes.xml \$(prefix)/share/apertium/\$(BASENAME) + rm -rf modes + mv modes.bak modes + test -d \$(DESTDIR)\$(apertium_modesdir) || mkdir \$(DESTDIR)\$(apertium_modesdir) + modes=\`xmllint --xpath '//mode@<:@@install="yes"@:>@/@name' modes.xml | sed 's/ *name="\(@<:@^"@:>@*\)"/\1.mode /g'\`; \\ + if test -n "\$\$modes"; then \\ + \$(INSTALL_DATA) \$\$modes \$(DESTDIR)\$(apertium_modesdir); \\ + rm \$\$modes; \\ + fi + +.deps/.d: + test -d .deps || mkdir .deps + touch \$[]@ + +.PRECIOUS: .deps/.d + +langs: + @fail=; \ + if \$(am__make_keepgoing); then \ + failcom='fail=yes'; \ + else \ + failcom='exit 1'; \ + fi; \ + dot_seen=no; \ + list='\$(AP_SUBDIRS)'; \ + for subdir in \$\$list; do \ + echo "Making \$\$subdir"; \ + (\$(am__cd) \$\$subdir && \$(MAKE) \$(AM_MAKEFLAGS) all-am) \ + || eval \$\$failcom; \ + done; \ + \$(MAKE) \$(AM_MAKEFLAGS) all-am || exit 1; \ + test -z "\$\$fail" +.PHONY: langs + + +.deps/%.autobil.prefixes: %.autobil.bin .deps/.d + lt-print $< | sed 's/ /@_SPACE_@/g' > .deps/\@S|@*.autobil.att + hfst-txt2fst -e Δ < .deps/\@S|@*.autobil.att > .deps/\@S|@*.autobil.hfst + hfst-project -p upper .deps/\@S|@*.autobil.hfst > .deps/\@S|@*.autobil.upper # bidix + echo ' @<:@ ? 
- %+ @:>@* ' | hfst-regexp2fst > .deps/\@S|@*.any-nonplus.hfst # [^+]* + hfst-concatenate -1 .deps/\@S|@*.autobil.upper -2 .deps/\@S|@*.any-nonplus.hfst -o .deps/\@S|@*.autobil.nonplussed # bidix [^+]* + echo ' %+ ' | hfst-regexp2fst > .deps/\@S|@*.single-plus.hfst # + + hfst-concatenate -1 .deps/\@S|@*.single-plus.hfst -2 .deps/\@S|@*.autobil.nonplussed -o .deps/\@S|@*.autobil.postplus # + bidix [^+]* + hfst-repeat -f0 -t3 -i .deps/\@S|@*.autobil.postplus -o .deps/\@S|@*.autobil.postplus.0,3 # (+ bidix [^+]*){0,3} -- gives at most three + + hfst-concatenate -1 .deps/\@S|@*.autobil.nonplussed -2 .deps/\@S|@*.autobil.postplus.0,3 -o \@S|@@ # bidix [^+]* (+ bidix [^+]*){0,3} + +EOF + +]) Index: branches/apertium-tagger/apertium2/apertium.pc.in =================================================================== --- branches/apertium-tagger/apertium2/apertium.pc.in (nonexistent) +++ branches/apertium-tagger/apertium2/apertium.pc.in (revision 69632) @@ -0,0 +1,10 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: apertium +Description: rule-based machine translation system +Version: @VERSION@ +Libs: -L${libdir} -l@GENERIC_LIBRARY_NAME@@GENERIC_MAJOR_VERSION@ @APERTIUM_LIBS@ +Cflags: -I${includedir}/@GENERIC_LIBRARY_NAME@-@GENERIC_API_VERSION@ -I${libdir}/@GENERIC_LIBRARY_NAME@-@GENERIC_API_VERSION@/include @APERTIUM_CFLAGS@ Index: branches/apertium-tagger/apertium2/COPYING =================================================================== --- branches/apertium-tagger/apertium2/COPYING (nonexistent) +++ branches/apertium-tagger/apertium2/COPYING (revision 69632) @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License.
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. Index: branches/apertium-tagger/apertium2/COPYING.hunalign =================================================================== --- branches/apertium-tagger/apertium2/COPYING.hunalign (nonexistent) +++ branches/apertium-tagger/apertium2/COPYING.hunalign (revision 69632) @@ -0,0 +1,502 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users.
+ + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. 
Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. 
+ + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". 
+ + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. 
+ + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! Index: branches/apertium-tagger/apertium2/README =================================================================== --- branches/apertium-tagger/apertium2/README (nonexistent) +++ branches/apertium-tagger/apertium2/README (revision 69632) @@ -0,0 +1,68 @@ +REQUIREMENTS + +* This package needs the package lttoolbox-3.3.1 installed in the +system, as well as libxml and libpcre. + +See www.apertium.org for more information on installing.
+ +DESCRIPTION + +When building, this package generates, among others, the following +modules: + +* apertium-deshtml, apertium-desrtf, apertium-destxt + Deformatters for html, rtf and txt document formats. + +* apertium-rehtml, apertium-rertf, apertium-retxt + Reformatters for html, rtf and txt document formats. + +* apertium + Translator program. Execute without parameters to see the usage. + +QUICK START + +1) Download the packages for lttoolbox-VERSION.tar.gz and + apertium-VERSION.tar.gz and linguistic data + + Note: If you are using the translator from SVN, run ./autogen.sh before + running ./configure in all cases. + +2) Unpack lttoolbox and do ('#' means 'do that with root privileges'): + $ cd lttoolbox-VERSION + $ ./configure + $ make + # make install + +3) Unpack apertium and do: + $ cd apertium-VERSION + $ ./configure + $ make + # make install + +4) Unpack linguistic data (LING_DATA_DIR) and do: + $ cd LING_DATA_DIR + $ ./configure + $ make + and wait for a while (minutes). + +5) Use the translator + + USAGE: apertium [-d datadir] [-f format] [-u] <direction> [in [out]] + -d datadir directory of linguistic data + -f format one of: txt (default), html, rtf, odt, docx, wxml, xlsx, pptx, + xpresstag, html-noent, latex, latex-raw + -a display ambiguity + -u don't display marks '*' for unknown words + -n don't insert period before possible sentence-ends + -m memory.tmx use a translation memory to recycle translations + -o direction translation direction using the translation memory, + by default 'direction' is used instead + -l lists the available translation directions and exits + direction typically, LANG1-LANG2, but see modes.xml in language data + in input file (stdin by default) + out output file (stdout by default) + + + Sample: + + $ apertium -f txt es-ca <input >output Index: branches/apertium-tagger/apertium2/ChangeLog =================================================================== --- branches/apertium-tagger/apertium2/ChangeLog (nonexistent) +++
branches/apertium-tagger/apertium2/ChangeLog (revision 69632)
@@ -0,0 +1,11 @@
+(See SVN for the actual ChangeLog.)
+
+
+
+Mon Jun 5 00:29:11 BST 2006
+
+Initial packaging.
+
+Wed Oct 3 07:12:19 BST 2007
+
+Packaging version 3.0.
Index: branches/apertium-tagger/apertium2/autogen.sh
===================================================================
--- branches/apertium-tagger/apertium2/autogen.sh (nonexistent)
+++ branches/apertium-tagger/apertium2/autogen.sh (revision 69632)
@@ -0,0 +1,35 @@
+#! /bin/sh
+
+# Work out the installation prefix: honour "--prefix=DIR" or "--prefix DIR"
+# when one is given on the command line, otherwise fall back to /usr/local.
+PREFIX=/usr/local
+want_value=false
+for arg in "$@"; do
+  case $arg in
+    --prefix)     # value arrives as the following argument
+      want_value=true
+      ;;
+    --prefix=*)   # value is attached after the first '='
+      PREFIX="${arg#*=}"
+      ;;
+    *)
+      if $want_value; then PREFIX="$arg"; want_value=false; fi
+      ;;
+  esac
+done
+
+# Set the paths needed by libtool/pkg-config/aclocal etc. By inferring
+# them based on --prefix, users don't have to edit ~/.bashrc. We only
+# append, so if a user has some other preference, that will override.
+PATH="${PATH}:${PREFIX}/bin:/usr/local/bin" +export PATH +LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${PREFIX}/lib" +export LD_LIBRARY_PATH +PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${PREFIX}/share/pkgconfig:${PREFIX}/lib/pkgconfig" +export PKG_CONFIG_PATH +ACLOCAL_PATH="${ACLOCAL_PATH}:${PREFIX}/share/aclocal" +export ACLOCAL_PATH + + +# Pass on all args to configure +autoreconf -fi && ./configure "$@" Property changes on: branches/apertium-tagger/apertium2/autogen.sh ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: branches/apertium-tagger/apertium2/README-MODES =================================================================== --- branches/apertium-tagger/apertium2/README-MODES (nonexistent) +++ branches/apertium-tagger/apertium2/README-MODES (revision 69632) @@ -0,0 +1,3 @@ +For information on modes, please see our Wiki: + + http://wiki.apertium.org/wiki/Modes Index: branches/apertium-tagger/apertium2/AUTHORS =================================================================== --- branches/apertium-tagger/apertium2/AUTHORS (nonexistent) +++ branches/apertium-tagger/apertium2/AUTHORS (revision 69632) @@ -0,0 +1,11 @@ +(c) 2005-2007 Universitat d'Alacant / Universidad de Alicante. +(c) 2007-2008 Prompsit Language Engineering S.L. + +Most of the files tmx_* are taken from the hunalign package: +(C) Copyright 2004. Media Research Centre at the +Sociology and Communications Department of the +Budapest University of Technology and Economics. + +hunalign is licensed under the GNU Lesser GPL v. 2.1, see +COPYING.hunalign for more details. + Index: branches/apertium-tagger/apertium2/cmake/CMakeUseFlex.cmake =================================================================== --- branches/apertium-tagger/apertium2/cmake/CMakeUseFlex.cmake (nonexistent) +++ branches/apertium-tagger/apertium2/cmake/CMakeUseFlex.cmake (revision 69632) @@ -0,0 +1,42 @@ +# - Look for GNU flex, the lexer generator.
+# Defines the following: +# FLEX_EXECUTABLE - path to the flex executable +# FLEX_FILE - parse a file with flex +# FLEX_PREFIX_OUTPUTS - Set to true to make FLEX_FILE produce outputs of +# lex.${filename}.c, not lex.yy.c . Passes -P to flex. + +IF(NOT DEFINED FLEX_PREFIX_OUTPUTS) + SET(FLEX_PREFIX_OUTPUTS FALSE) +ENDIF(NOT DEFINED FLEX_PREFIX_OUTPUTS) + +IF(NOT FLEX_EXECUTABLE) + FIND_PROGRAM(FLEX_EXECUTABLE flex) + IF (FLEX_EXECUTABLE) + MESSAGE (STATUS "Found flex -- ${FLEX_EXECUTABLE}") + ELSE (FLEX_EXECUTABLE) + MESSAGE (SEND_ERROR "flex not found") + ENDIF(FLEX_EXECUTABLE) +ENDIF(NOT FLEX_EXECUTABLE) + +IF(FLEX_EXECUTABLE) + MACRO(FLEX OUT_FILE SWITCHES IN_FILE) + GET_FILENAME_COMPONENT(PATH "${IN_FILE}" PATH) + + IF(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${PATH}") + FILE(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${PATH}") + ENDIF(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/${PATH}") + + SET(FULL_OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${PATH}${OUT_FILE}") + SET(FULL_IN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${IN_FILE}") + + ADD_CUSTOM_COMMAND( + OUTPUT "${FULL_OUT_FILE}" + COMMAND "${FLEX_EXECUTABLE}" + ARGS "${SWITCHES}" + -o"${FULL_OUT_FILE}" + "${FULL_IN_FILE}" + DEPENDS "${FULL_IN_FILE}") + + SET_SOURCE_FILES_PROPERTIES("${FULL_OUT_FILE}" PROPERTIES GENERATED TRUE) + ENDMACRO(FLEX) +ENDIF(FLEX_EXECUTABLE) Index: branches/apertium-tagger/apertium2/cmake/CMakeUseXsltproc.cmake =================================================================== --- branches/apertium-tagger/apertium2/cmake/CMakeUseXsltproc.cmake (nonexistent) +++ branches/apertium-tagger/apertium2/cmake/CMakeUseXsltproc.cmake (revision 69632) @@ -0,0 +1,35 @@ +# - Look for xsltproc, the XSLT processor. +# Defines the following: +# XSLTPROC_EXECUTABLE - path to the xsltproc executable +# XSLTPROC - macro: apply an XSL stylesheet to an XML file with xsltproc +# XSLTPROC(OUT_FILE XSL_FILE XML_FILE) - all three paths are taken relative to +# CMAKE_CURRENT_SOURCE_DIR; OUT_FILE is marked as a GENERATED source.
+ +IF(NOT XSLTPROC_EXECUTABLE) + FIND_PROGRAM(XSLTPROC_EXECUTABLE xsltproc) + IF (XSLTPROC_EXECUTABLE) + MESSAGE (STATUS "Found xsltproc -- ${XSLTPROC_EXECUTABLE}") + ELSE (XSLTPROC_EXECUTABLE) + MESSAGE (SEND_ERROR "xsltproc not found") + ENDIF(XSLTPROC_EXECUTABLE) +ENDIF(NOT XSLTPROC_EXECUTABLE) + +IF(XSLTPROC_EXECUTABLE) + MACRO(XSLTPROC OUT_FILE XSL_FILE XML_FILE) + GET_FILENAME_COMPONENT(PATH "${XSL_FILE}" PATH) + + SET(FULL_OUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${OUT_FILE}") + SET(FULL_XSL_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${XSL_FILE}") + SET(FULL_XML_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${XML_FILE}") + + ADD_CUSTOM_COMMAND( + OUTPUT "${FULL_OUT_FILE}" + COMMAND "${XSLTPROC_EXECUTABLE}" + ARGS -o "${FULL_OUT_FILE}" + "${FULL_XSL_FILE}" + "${FULL_XML_FILE}" + DEPENDS "${FULL_XSL_FILE}" "${FULL_XML_FILE}") + + SET_SOURCE_FILES_PROPERTIES("${FULL_OUT_FILE}" PROPERTIES GENERATED TRUE) + ENDMACRO(XSLTPROC) +ENDIF(XSLTPROC_EXECUTABLE) Index: branches/apertium-tagger/apertium2/cmake/FindLibPcre.cmake =================================================================== --- branches/apertium-tagger/apertium2/cmake/FindLibPcre.cmake (nonexistent) +++ branches/apertium-tagger/apertium2/cmake/FindLibPcre.cmake (revision 69632) @@ -0,0 +1,38 @@ +IF (LIBPCRE_INCLUDE_DIR AND LIBPCRE_LIBRARIES) + # in cache already + SET(LibPcre_FIND_QUIETLY TRUE) +ENDIF (LIBPCRE_INCLUDE_DIR AND LIBPCRE_LIBRARIES) + +IF (NOT WIN32) + # use pkg-config to get the directories and then use these values + # in the FIND_PATH() and FIND_LIBRARY() calls + INCLUDE(UsePkgConfig) + PKGCONFIG(libpcre LIBPCRE_INCLUDES LIBPCRE_LIB_DIR LIBPCRE_LDFLAGS LIBPCRE_CFLAGS) + SET(LIBPCRE_DEFINITIONS ${LIBPCRE_CFLAGS}) +ENDIF (NOT WIN32) + +FIND_PATH(LIBPCRE_INCLUDE_DIR pcre.h + PATHS ${LIBPCRE_INCLUDES}) + +FIND_LIBRARY(LIBPCRE_LIBRARIES + NAMES pcre libpcre + PATHS ${LIBPCRE_LIB_DIR}) + +IF (LIBPCRE_INCLUDE_DIR AND LIBPCRE_LIBRARIES) + SET(LIBPCRE_FOUND TRUE) +ELSE (LIBPCRE_INCLUDE_DIR AND LIBPCRE_LIBRARIES) +
SET(LIBPCRE_FOUND FALSE) +ENDIF (LIBPCRE_INCLUDE_DIR AND LIBPCRE_LIBRARIES) + +IF (LIBPCRE_FOUND) + IF (NOT LibPcre_FIND_QUIETLY) + MESSAGE(STATUS "Found LibPcre: ${LIBPCRE_LIBRARIES}") + ENDIF (NOT LibPcre_FIND_QUIETLY) +ELSE (LIBPCRE_FOUND) + IF (LibPcre_FIND_REQUIRED) + MESSAGE(SEND_ERROR "Could NOT find LibPcre") + ENDIF (LibPcre_FIND_REQUIRED) +ENDIF (LIBPCRE_FOUND) + +MARK_AS_ADVANCED(LIBPCRE_INCLUDE_DIR LIBPCRE_LIBRARIES) + Index: branches/apertium-tagger/apertium2/cmake/FindLibXml2.cmake =================================================================== --- branches/apertium-tagger/apertium2/cmake/FindLibXml2.cmake (nonexistent) +++ branches/apertium-tagger/apertium2/cmake/FindLibXml2.cmake (revision 69632) @@ -0,0 +1,59 @@ +# - Try to find LibXml2 +# Once done this will define +# +# LIBXML2_FOUND - system has LibXml2 +# LIBXML2_INCLUDE_DIR - the LibXml2 include directory +# LIBXML2_LIBRARIES - the libraries needed to use LibXml2 +# LIBXML2_DEFINITIONS - Compiler switches required for using LibXml2 +# +# Copyright (c) 2006, Alexander Neundorf +# This code is available under the BSD license, see licenses/BSD for details. + +# Copyright (c) 2006, Alexander Neundorf, +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+ + +IF (LIBXML2_INCLUDE_DIR AND LIBXML2_LIBRARIES) + # in cache already + SET(LibXml2_FIND_QUIETLY TRUE) +ENDIF (LIBXML2_INCLUDE_DIR AND LIBXML2_LIBRARIES) + +IF (NOT WIN32) + # use pkg-config to get the directories and then use these values + # in the FIND_PATH() and FIND_LIBRARY() calls + INCLUDE(UsePkgConfig) + PKGCONFIG(libxml-2.0 _LibXml2IncDir _LibXml2LinkDir _LibXml2LinkFlags _LibXml2Cflags) + SET(LIBXML2_DEFINITIONS ${_LibXml2Cflags}) +ENDIF (NOT WIN32) + +FIND_PATH(LIBXML2_INCLUDE_DIR libxml/xpath.h + PATHS + ${_LibXml2IncDir} + PATH_SUFFIXES libxml2 + ) + +FIND_LIBRARY(LIBXML2_LIBRARIES NAMES xml2 libxml2 + PATHS + ${_LibXml2LinkDir} + ) + +IF (LIBXML2_INCLUDE_DIR AND LIBXML2_LIBRARIES) + SET(LIBXML2_FOUND TRUE) +ELSE (LIBXML2_INCLUDE_DIR AND LIBXML2_LIBRARIES) + SET(LIBXML2_FOUND FALSE) +ENDIF (LIBXML2_INCLUDE_DIR AND LIBXML2_LIBRARIES) + +IF (LIBXML2_FOUND) + IF (NOT LibXml2_FIND_QUIETLY) + MESSAGE(STATUS "Found LibXml2: ${LIBXML2_LIBRARIES}") + ENDIF (NOT LibXml2_FIND_QUIETLY) +ELSE (LIBXML2_FOUND) + IF (LibXml2_FIND_REQUIRED) + MESSAGE(SEND_ERROR "Could NOT find LibXml2") + ENDIF (LibXml2_FIND_REQUIRED) +ENDIF (LIBXML2_FOUND) + +MARK_AS_ADVANCED(LIBXML2_INCLUDE_DIR LIBXML2_LIBRARIES) + Index: branches/apertium-tagger/apertium2/cmake/FindLttoolbox3.cmake =================================================================== --- branches/apertium-tagger/apertium2/cmake/FindLttoolbox3.cmake (nonexistent) +++ branches/apertium-tagger/apertium2/cmake/FindLttoolbox3.cmake (revision 69632) @@ -0,0 +1,57 @@ +# - Try to find Lttoolbox3 +# Once done this will define +# +# LTTOOLBOX3_FOUND - system has Lttoolbox3 +# LTTOOLBOX3_INCLUDE_DIR - the Lttoolbox3 include directory +# LTTOOLBOX3_LIBRARIES - the libraries needed to use Lttoolbox3 +# LTTOOLBOX3_DEFINITIONS - Compiler switches required for using Lttoolbox3 +# +# Copyright (c) 2006, Alexander Neundorf +# This code is available under the BSD license, see licenses/BSD for details. 
+ +# Copyright (c) 2006, Alexander Neundorf, +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. +# +# This is derived from FindLibXml2.cmake + +IF (LTTOOLBOX3_INCLUDE_DIR AND LTTOOLBOX3_LIBRARIES) + # in cache already + SET(Lttoolbox3_FIND_QUIETLY TRUE) +ENDIF (LTTOOLBOX3_INCLUDE_DIR AND LTTOOLBOX3_LIBRARIES) + +IF (NOT WIN32) + # use pkg-config to get the directories and then use these values + # in the FIND_PATH() and FIND_LIBRARY() calls + INCLUDE(UsePkgConfig) + PKGCONFIG(lttoolbox-3.0 LTTOOLBOX3_INCLUDES LTTOOLBOX3_LIB_DIR LTTOOLBOX3_LDFLAGS LTTOOLBOX3_CFLAGS) + SET(LTTOOLBOX3_DEFINITIONS ${LTTOOLBOX3_CFLAGS}) +ENDIF (NOT WIN32) + +FIND_PATH(LTTOOLBOX3_INCLUDE_DIR lttoolbox/alphabet.h + PATHS ${LTTOOLBOX3_INCLUDES} + PATH_SUFFIXES lttoolbox-3.0) + +FIND_LIBRARY(LTTOOLBOX3_LIBRARIES + NAMES lttoolbox3 + PATHS ${LTTOOLBOX3_LIB_DIR}) + +IF (LTTOOLBOX3_INCLUDE_DIR AND LTTOOLBOX3_LIBRARIES) + SET(LTTOOLBOX3_FOUND TRUE) +ELSE (LTTOOLBOX3_INCLUDE_DIR AND LTTOOLBOX3_LIBRARIES) + SET(LTTOOLBOX3_FOUND FALSE) +ENDIF (LTTOOLBOX3_INCLUDE_DIR AND LTTOOLBOX3_LIBRARIES) + +IF (LTTOOLBOX3_FOUND) + IF (NOT Lttoolbox3_FIND_QUIETLY) + MESSAGE(STATUS "Found Lttoolbox3: ${LTTOOLBOX3_LIBRARIES}") + ENDIF (NOT Lttoolbox3_FIND_QUIETLY) +ELSE (LTTOOLBOX3_FOUND) + IF (Lttoolbox3_FIND_REQUIRED) + MESSAGE(SEND_ERROR "Could NOT find Lttoolbox3") + ENDIF (Lttoolbox3_FIND_REQUIRED) +ENDIF (LTTOOLBOX3_FOUND) + +MARK_AS_ADVANCED(LTTOOLBOX3_INCLUDE_DIR LTTOOLBOX3_LIBRARIES) + Index: branches/apertium-tagger/apertium2 =================================================================== --- branches/apertium-tagger/apertium2 (nonexistent) +++ branches/apertium-tagger/apertium2 (revision 69632) Property changes on: branches/apertium-tagger/apertium2 ___________________________________________________________________ Added: svn:ignore ## -0,0 +1,16 ## +autom4te.cache +Makefile +Makefile.in +missing 
+configure +config.sub +config.status +config.log +config.guess +aclocal.m4 +*.pc +depcomp +install-sh +libtool +ltmain.sh +compile