commit 985127e770de9cc4f09c7f485db19cd6ae031be8 Author: Daniel Swanson Date: Fri Jun 11 13:23:40 2021 -0500 unbundle utfcpp and drop old utf_converter code diff --git a/Makefile.am b/Makefile.am index 444db77..dc6873f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,7 +15,7 @@ if HAVE_PYTHON_BINDINGS SUBDIRS += python endif -EXTRA_DIST=autogen.sh README-MODES apertium.m4 utf8 tests +EXTRA_DIST=autogen.sh README-MODES apertium.m4 tests install-data-local: mkdir -p $(DESTDIR)$(modesdir) diff --git a/apertium/Makefile.am b/apertium/Makefile.am index 494cfd4..e3cecc2 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -75,7 +75,6 @@ h_sources = a.h \ ttag.h \ unigram_tagger.h \ unlocked_cstdio.h \ - utf_converter.h \ utils.h \ xml_reader.h \ xml_walk_util.h @@ -140,7 +139,6 @@ cc_sources = a.cc \ trx_reader.cc \ tsx_reader.cc \ unigram_tagger.cc \ - utf_converter.cc \ xml_reader.cc \ xml_walk_util.cc @@ -303,7 +301,7 @@ apertium_gen_modes_SOURCES = gen_modes.cc apertium_gen_modes_LDADD = -lapertium$(VERSION_MAJOR) $(lib_LTLIBRARIES) if WINDOWS -AM_CPPFLAGS = -I$(top_srcdir)/utf8 -I$(top_srcdir)/apertium/win32 -I$(top_srcdir) $(APERTIUM_CFLAGS) $(ICU_CFLAGS) +AM_CPPFLAGS = -I$(top_srcdir)/apertium/win32 -I$(top_srcdir) $(APERTIUM_CFLAGS) $(ICU_CFLAGS) else AM_CPPFLAGS = -I$(top_srcdir) $(APERTIUM_CFLAGS) $(ICU_CFLAGS) endif diff --git a/apertium/adapt_docx.cc b/apertium/adapt_docx.cc index 4699a0c..91f4937 100644 --- a/apertium/adapt_docx.cc +++ b/apertium/adapt_docx.cc @@ -30,7 +30,7 @@ #include #endif -#include "utf8/utf8.h" +#include #include "unicode/uchar.h" using namespace std; diff --git a/apertium/apertium-postlatex.l b/apertium/apertium-postlatex.l index 4cf5edf..39bea76 100644 --- a/apertium/apertium-postlatex.l +++ b/apertium/apertium-postlatex.l @@ -23,7 +23,7 @@ extern "C" { #include "apertium_config.h" #endif #include -#include +#include #ifdef _WIN32 #include #include diff --git a/apertium/apertium_pretransfer.cc b/apertium/apertium_pretransfer.cc index b67fac8..25bccc0 100644 --- a/apertium/apertium_pretransfer.cc +++ b/apertium/apertium_pretransfer.cc @@ -103,7 +103,7 @@ int main(int argc, char *argv[]) if(!input.open(argv[argc-1])) { usage(argv[0]); } - u_finit(stdout, NULL, NULL); + output = u_finit(stdout, NULL, NULL); } else { diff --git a/apertium/apertium_tmxbuild.cc b/apertium/apertium_tmxbuild.cc index 06588d8..55f81b3 100644 --- a/apertium/apertium_tmxbuild.cc +++ b/apertium/apertium_tmxbuild.cc @@ -24,7 +24,6 @@ #include #include -#include #include #include "apertium_config.h" #include diff --git a/apertium/deformat.xsl b/apertium/deformat.xsl index 56c2aa6..6e71285 100644 --- a/apertium/deformat.xsl +++ b/apertium/deformat.xsl @@ -168,7 +168,7 @@ extern "C" { #ifndef GENFORMAT #include "apertium_config.h" #endif -#include <utf8/utf8.h> +#include <utf8.h> #include <apertium/unlocked_cstdio.h> #ifdef _WIN32 #include <io.h> diff --git a/apertium/exception_type.cc b/apertium/exception_type.cc index c83dc3f..7c1eec8 100644 --- a/apertium/exception_type.cc +++ b/apertium/exception_type.cc @@ -15,7 +15,6 @@ #include "exception_type.h" -#include "utf_converter.h" #include #include diff --git a/apertium/gen_modes.cc b/apertium/gen_modes.cc index 1dc7f8f..0a4f07f 100644 --- a/apertium/gen_modes.cc +++ b/apertium/gen_modes.cc @@ -22,7 +22,6 @@ #include #include #include "string_utils.h" -#include "utf_converter.h" #include #include #include diff --git a/apertium/mtx_reader.cc b/apertium/mtx_reader.cc index 159e8cc..70f85de 100644 --- a/apertium/mtx_reader.cc +++ b/apertium/mtx_reader.cc @@ -22,8 +22,6 @@ #include #include -#include // TODO - #include #include #include diff --git a/apertium/perceptron_spec.cc b/apertium/perceptron_spec.cc index 378d22b..b70560a 100644 --- a/apertium/perceptron_spec.cc +++ b/apertium/perceptron_spec.cc @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace Apertium { diff --git a/apertium/reformat.xsl b/apertium/reformat.xsl index 0fca2bc..07ad1b2 100644 --- a/apertium/reformat.xsl +++ b/apertium/reformat.xsl @@ -26,7 +26,7 @@ #ifndef GENFORMAT #include "apertium_config.h" #endif -#include <utf8/utf8.h> +#include <utf8.h> #include <apertium/unlocked_cstdio.h> #include <cstdlib> diff --git a/apertium/tagger_utils.cc b/apertium/tagger_utils.cc index 7a1bcf6..0e8738d 100644 --- a/apertium/tagger_utils.cc +++ b/apertium/tagger_utils.cc @@ -23,18 +23,6 @@ #include #include #include -#ifdef _MSC_VER -#define wcstok wcstok_s -#endif -#ifdef __MINGW32__ - -wchar_t *_wcstok(wchar_t *wcs, const wchar_t *delim, wchar_t **ptr) { - (void)ptr; - return wcstok(wcs, delim); -} - -#define wcstok _wcstok -#endif using namespace Apertium; @@ -75,25 +63,6 @@ int tagger_utils::ntokens_multiword(UString const &s) } } return n; - /* - wchar_t *news = new wchar_t[s.size()+1]; - wcscpy(news, s.c_str()); - news[s.size()] = 0; - cerr << news << endl; - - wchar_t const *delim = "_"; - wchar_t *ptr; - int n=0; - - if (wcstok(news, delim, &ptr)) - n++; - while (wcstok(NULL, delim, &ptr)) - n++; - - delete[] news; - - return n; - */ } int tagger_utils::nguiones_fs(UString const & s) { @@ -105,24 +74,6 @@ int tagger_utils::nguiones_fs(UString const & s) { } } return n; - /* - UChar *news = new UChar[s.size()+1]; - wcscpy(news, s.c_str()); - news[s.size()] = 0; - cerr << news << endl; - wchar_t const *delim = "-"; - wchar_t *ptr; - int n=0; - - if (wcstok(news, delim, &ptr)) - n++; - while (wcstok(NULL, delim, &ptr)) - n++; - - delete[] news; - - return n; - */ } UString tagger_utils::trim(UString s) @@ -200,7 +151,6 @@ set & tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { set &ret = td.getOpenClass(); Collection &output = td.getOutput(); - int ret_idx = output[ret]; for (int k=0; k &ambg_class = output[k]; @@ -209,7 +159,6 @@ tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { continue; } if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { - ret_idx = k; ret = ambg_class; } } @@ -304,4 +253,3 @@ ostream& operator<< (ostream& os, const set& s) { os<<'}'; return os; } - diff --git a/apertium/tagger_word.cc b/apertium/tagger_word.cc index cb593b5..83f15a6 100644 --- a/apertium/tagger_word.cc +++ b/apertium/tagger_word.cc @@ -15,7 +15,6 @@ * along with this program; if not, see . */ #include -#include #include #include "apertium_config.h" #include diff --git a/apertium/tmx_builder.cc b/apertium/tmx_builder.cc index d87eb7a..b535b3b 100644 --- a/apertium/tmx_builder.cc +++ b/apertium/tmx_builder.cc @@ -15,7 +15,6 @@ * along with this program; if not, see . */ #include -#include #include #include #include diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc index f350891..d5e2b6c 100644 --- a/apertium/transfer_data.cc +++ b/apertium/transfer_data.cc @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/apertium/transfer_mult.cc b/apertium/transfer_mult.cc index a163430..3f00496 100644 --- a/apertium/transfer_mult.cc +++ b/apertium/transfer_mult.cc @@ -18,17 +18,12 @@ #include #include #include -#include #include #include #include #include -#ifdef _WIN32 -#include -#endif - using namespace std; void diff --git a/apertium/utf_converter.cc b/apertium/utf_converter.cc deleted file mode 100644 index e721152..0000000 --- a/apertium/utf_converter.cc +++ /dev/null @@ -1,613 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#include -#include -#include -#include - -using namespace Apertium; - -#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD -#define UNI_MAX_BMP (UTF32)0x0000FFFF -#define UNI_MAX_UTF16 (UTF32)0x0010FFFF -#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF -#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_HIGH_END (UTF32)0xDBFF -#define UNI_SUR_LOW_START (UTF32)0xDC00 -#define UNI_SUR_LOW_END (UTF32)0xDFFF - -using namespace std; - -namespace UtfConverter -{ - - typedef unsigned int UTF32; /* at least 32 bits */ - typedef unsigned short UTF16; /* at least 16 bits */ - typedef unsigned char UTF8; /* typically 8 bits */ - - /* Some fundamental constants */ - - typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ - } ConversionResult; - - typedef enum { - strictConversion = 0, - lenientConversion - } ConversionFlags; - - static const int halfShift = 10; /* used for shifting by 10 bits */ - - static const UTF32 halfBase = 0x0010000UL; - static const UTF32 halfMask = 0x3FFUL; - - - void conversionError() - { - cerr << "Error: conversion error" << endl; - exit(EXIT_FAILURE); - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; - - return result; - } - - /* --------------------------------------------------------------------- */ - - /* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ - static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 - }; - - /* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ - static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - - /* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ - static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - - /* --------------------------------------------------------------------- */ - - /* The interface converts a whole buffer to avoid function-call overhead. - * Constants have been gathered. Loops & conditionals have been removed as - * much as possible for efficiency, in favor of drop-through switches. - * (See "Note A" at the bottom of the file for equivalent code.) - * If your compiler supports it, the "isLegalUTF8" call can be turned - * into an inline function. - */ - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - /* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns false. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ - - static bool isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; - } - - /* --------------------------------------------------------------------- */ - - /* - * Exported function to return whether a UTF-8 sequence is legal or not. - * This is not used here; it's just exported. - */ - bool isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; - } - return isLegalUTF8(source, length); - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; - } - - /* --------------------------------------------------------------------- */ - - ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; - } - - wstring fromUtf8(string const & utf8string) - { - size_t widesize = utf8string.length(); - if (sizeof(wchar_t) == 2) - { - wstring resultstring; - resultstring.resize(widesize+1, L'\0'); - const UTF8* sourcestart = reinterpret_cast(utf8string.c_str()); - const UTF8* sourceend = sourcestart + widesize; - UTF16* targetstart = reinterpret_cast(&resultstring[0]); - UTF16* targetend = targetstart + widesize; - ConversionResult res = ConvertUTF8toUTF16(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0, wcslen(resultstring.c_str())); - } - else if (sizeof(wchar_t) == 4) - { - wstring resultstring; - resultstring.resize(widesize+1, L'\0'); - const UTF8* sourcestart = reinterpret_cast(utf8string.c_str()); - const UTF8* sourceend = sourcestart + widesize; - UTF32* targetstart = reinterpret_cast(&resultstring[0]); - UTF32* targetend = targetstart + widesize; - ConversionResult res = ConvertUTF8toUTF32(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0,wcslen(resultstring.c_str())); - } - else - { - conversionError(); - } - return L""; - } - - string toUtf8(wstring const &widestring) - { - size_t widesize = widestring.length(); - - if (sizeof(wchar_t) == 2) - { - size_t utf8size = 3 * widesize + 1; - string resultstring; - resultstring.resize(utf8size, '\0'); - const UTF16* sourcestart = reinterpret_cast(widestring.c_str()); - const UTF16* sourceend = sourcestart + widesize; - UTF8* targetstart = reinterpret_cast(&resultstring[0]); - UTF8* targetend = targetstart + utf8size; - ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0, strlen(resultstring.c_str())); - } - else if (sizeof(wchar_t) == 4) - { - size_t utf8size = 4 * widesize + 1; - string resultstring; - resultstring.resize(utf8size, '\0'); - const UTF32* sourcestart = reinterpret_cast(widestring.c_str()); - const UTF32* sourceend = reinterpret_cast(widestring.c_str() + widesize); - UTF8* targetstart = reinterpret_cast(&resultstring[0]); - UTF8* targetend = targetstart + utf8size; - ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); - if (res != conversionOK) - { - conversionError(); - } - *targetstart = 0; - return resultstring.substr(0, strlen(resultstring.c_str())); - } - else - { - conversionError(); - } - return ""; - } -} diff --git a/apertium/utf_converter.h b/apertium/utf_converter.h deleted file mode 100644 index 5e1f5b3..0000000 --- a/apertium/utf_converter.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ -#ifndef _UTFCONVERTER_ -#define _UTFCONVERTER_ - -#include - -using namespace std; - -namespace UtfConverter -{ - wstring fromUtf8(string const &utf8string); - string toUtf8(wstring const &widestring); -} - -#endif diff --git a/apertium/xml_reader.h b/apertium/xml_reader.h index 3ad28c9..a1c7028 100644 --- a/apertium/xml_reader.h +++ b/apertium/xml_reader.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include diff --git a/configure.ac b/configure.ac index 7965f13..b71d6dd 100644 --- a/configure.ac +++ b/configure.ac @@ -80,6 +80,7 @@ AC_LANG(C++) AC_HEADER_STDC AC_CHECK_HEADERS([stdlib.h string.h unistd.h stddef.h filesystem string_view]) AC_CHECK_LIB([stdc++fs], [_ZNSt12experimental10filesystem2v112current_pathEv]) +AC_CHECK_HEADER([utf8.h], [], [AC_MSG_ERROR([You don't have utfcpp installed.])]) AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt, getopt_long]) AC_CHECK_FUNCS([setlocale strdup getopt snprintf]) diff --git a/python/setup.py.in b/python/setup.py.in index 10d57cf..b5ed70a 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -23,7 +23,7 @@ def get_sources(): # interchunk.cc postchunk.cc transfer.cc 'apertium_re.cc', 'interchunk.cc', 'interchunk_word.cc', 'postchunk.cc', 'string_utils.cc', 'transfer.cc', 'transfer_data.cc', 'transfer_instr.cc', 'transfer_mult.cc', 'transfer_token.cc', 'transfer_word.cc', - 'trx_reader.cc', 'utf_converter.cc', 'xml_reader.cc', + 'trx_reader.cc', 'xml_reader.cc', # 'pretransfer.cc' 'pretransfer.cc', # tagger.cc diff --git a/tests/tagger/test_find_similar_ambiguity_classes.cc b/tests/tagger/test_find_similar_ambiguity_classes.cc index 8178788..938ac3c 100644 --- a/tests/tagger/test_find_similar_ambiguity_classes.cc +++ b/tests/tagger/test_find_similar_ambiguity_classes.cc @@ -1,5 +1,4 @@ #include -#include "apertium/utf_converter.h" #include "apertium/tagger_utils.h" #include "apertium/tagger_data_hmm.h" #include "apertium/tagger_data.h" diff --git a/utf8/utf8.h b/utf8/utf8.h deleted file mode 100644 index 82b13f5..0000000 --- a/utf8/utf8.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/utf8/utf8/checked.h b/utf8/utf8/checked.h deleted file mode 100644 index 1331155..0000000 --- a/utf8/utf8/checked.h +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t cp) : cp(cp) {} - virtual const char* what() const throw() { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const throw() { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - throw not_enough_room(); - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - template - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - /// Deprecated in versions that include "prior" - template - uint32_t previous(octet_iterator& it, octet_iterator pass_start) - { - octet_iterator end = it; - while (utf8::internal::is_trail(*(--it))) - if (it == pass_start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - octet_iterator temp = it; - return utf8::next(temp, end); - } - - template - void advance (octet_iterator& it, distance_type n, octet_iterator end) - { - for (distance_type i = 0; i < n; ++i) - utf8::next(it, end); - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast(trail_surrogate)); - } - else - throw invalid_utf16(static_cast(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start != end) { - uint32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start != end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& range_start, - const octet_iterator& range_end) : - it(octet_it), range_start(range_start), range_end(range_end) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#endif //header guard - - diff --git a/utf8/utf8/core.h b/utf8/utf8/core.h deleted file mode 100644 index f85081f..0000000 --- a/utf8/utf8/core.h +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); - const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template - inline uint8_t mask8(octet_type oc) - { - return static_cast(0xff & oc); - } - template - inline uint16_t mask16(u16_type oc) - { - return static_cast(0xffff & oc); - } - template - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - template - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - template - inline typename std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template - utf_error increase_safely(octet_iterator& it, octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - return UTF8_OK; - } - - template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. - code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } - - //Deprecated in release 2.3 - template - inline bool is_bom (octet_iterator it) - { - return ( - (utf8::internal::mask8(*it++)) == bom[0] && - (utf8::internal::mask8(*it++)) == bom[1] && - (utf8::internal::mask8(*it)) == bom[2] - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/utf8/utf8/unchecked.h b/utf8/utf8/unchecked.h deleted file mode 100644 index 989ccef..0000000 --- a/utf8/utf8/unchecked.h +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (cp < 0x80) // one octet - *(result++) = static_cast(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast((cp >> 6) | 0xc0); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast((cp >> 12) | 0xe0); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast((cp >> 18) | 0xf0); - *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast((cp & 0x3f) | 0x80); - } - return result; - } - - template - uint32_t next(octet_iterator& it) - { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template - uint32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template - uint32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) - template - inline uint32_t previous(octet_iterator& it) - { - return utf8::unchecked::prior(it); - } - - template - void advance (octet_iterator& it, distance_type n) - { - for (distance_type i = 0; i < n; ++i) - utf8::unchecked::next(it); - } - - template - typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return dist; - } - - template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast(cp); - } - return result; - } - - template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template - class iterator : public std::iterator { - octet_iterator it; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/utf8/utf8_fwrap.h b/utf8/utf8_fwrap.h deleted file mode 100644 index 5d41b6b..0000000 --- a/utf8/utf8_fwrap.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef _UTF8_FWRAP_HPP -#define _UTF8_FWRAP_HPP - -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 - #define utf32to8 utf16to8 -#endif - -inline wint_t fgetwc_u8(FILE *in) { -#ifdef _WIN32 - struct _cps { - FILE *f = 0; - wchar_t c = 0; - }; - static _cps cps[4]; - - for (auto& cp : cps) { - if (cp.f == in) { - cp.f = 0; - return cp.c; - } - } -#endif - - int32_t rv = 0; - int c = 0, i = 0; - char buf[4]; - if ((c = fgetc_unlocked(in)) != EOF) { - buf[i++] = static_cast(c); - if ((c & 0xF0) == 0xF0) { - if (fread_unlocked(buf+i, 1, 3, in) != 3) { - throw std::runtime_error("Could not read 3 expected bytes from stream"); - } - i += 3; - } - else if ((c & 0xE0) == 0xE0) { - if (fread_unlocked(buf+i, 1, 2, in) != 2) { - throw std::runtime_error("Could not read 2 expected bytes from stream"); - } - i += 2; - } - else if ((c & 0xC0) == 0xC0) { - if (fread_unlocked(buf+i, 1, 1, in) != 1) { - throw std::runtime_error("Could not read 1 expected byte from stream"); - } - i += 1; - } - } - if (i == 0 && c == EOF) { - rv = WEOF; - } - else { -#ifdef _WIN32 - wchar_t u16[2] = {}; - utf8::unchecked::utf8to16(buf, buf+i, u16); - - if (u16[1]) { - for (auto& cp : cps) { - if (cp.f == 0) { - cp.f = in; - cp.c = u16[1]; - return u16[0]; - } - } - throw std::runtime_error("Not enough space to store UTF-16 high surrogate"); - } - rv = u16[0]; -#else - utf8::unchecked::utf8to32(buf, buf+i, &rv); -#endif - } - return static_cast(rv); -} - -inline wint_t fputwc_u8(wint_t wc, FILE *out) { - char buf[4] = {}; - char *e = utf8::unchecked::utf32to8(&wc, &wc+1, buf); - if (fwrite_unlocked(buf, 1, e-buf, out) != static_cast(e-buf)) { - return WEOF; - } - - return wc; -} - -inline int fputws_u8(const wchar_t* str, FILE *out) { - static std::string buf; - buf.clear(); - size_t len = wcslen(str); - utf8::unchecked::utf32to8(str, str+len, std::back_inserter(buf)); - if (fwrite_unlocked(&buf[0], 1, buf.size(), out) != buf.size()) { - return WEOF; - } - - return 1; -} - -inline wint_t ungetwc_u8(wint_t wc, FILE *out) { - char buf[4] = {}; - char *e = utf8::unchecked::utf32to8(&wc, &wc+1, buf); - for (char *b = buf ; b != e ; ++b) { - if (ungetc(*b, out) == EOF) { - return WEOF; - } - } - - return wc; -} - -#ifdef fgetwc_unlocked - #undef fgetwc_unlocked -#endif -#define fgetwc_unlocked fgetwc_u8 - -#ifdef fputwc_unlocked - #undef fputwc_unlocked -#endif -#define fputwc_unlocked fputwc_u8 - -#ifdef fputws_unlocked - #undef fputws_unlocked -#endif -#define fputws_unlocked fputws_u8 - -#ifdef ungetwc - #undef ungetwc -#endif -#define ungetwc ungetwc_u8 - -#ifdef _WIN32 - #undef utf32to8 -#endif - -#endif