commit 0eb748fd019d0f11449747ddc5711ba28f9f5aa2 Author: Daniel Swanson Date: Thu Jun 3 12:43:24 2021 -0500 cleverness is to be avoided (investigating #85) diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index 7e14624..1ad56d0 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -71,7 +71,8 @@ AttCompiler::convert_hfst(UString& symbol) bool AttCompiler::is_word_punct(UChar symbol) { - return u_charType(symbol) & (U_NON_SPACING_MARK | U_ENCLOSING_MARK | U_COMBINING_SPACING_MARK); + // this version isn't quite right, but something like it should be possible + //return u_charType(symbol) & (U_NON_SPACING_MARK | U_ENCLOSING_MARK | U_COMBINING_SPACING_MARK); // https://en.wikipedia.org/wiki/Combining_character#Unicode_ranges if((symbol >= 0x0300 && symbol <= 0x036F) // Combining Diacritics || (symbol >= 0x1AB0 && symbol <= 0x1AFF) // ... Extended diff --git a/lttoolbox/input_file.cc b/lttoolbox/input_file.cc index ac6eda5..e7dea2f 100644 --- a/lttoolbox/input_file.cc +++ b/lttoolbox/input_file.cc @@ -56,27 +56,21 @@ InputFile::internal_read() ubuffer[buffer_size++] = '\0'; return; } - switch (cbuffer[0] & 0xF0) { - case 0xF0: + if ((cbuffer[0] & 0xF0) == 0xF0) { i += 3; if (fread(cbuffer+1, 1, 3, infile) != 3) { throw std::runtime_error("Could not read 3 expected bytes from stream"); } - break; - case 0xE0: + } else if ((cbuffer[0] & 0xE0) == 0xE0) { i += 2; if (fread(cbuffer+1, 1, 2, infile) != 2) { throw std::runtime_error("Could not read 2 expected bytes from stream"); } - break; - case 0xC0: + } else if ((cbuffer[0] & 0xC0) == 0xC0) { i += 1; if (fread(cbuffer+1, 1, 1, infile) != 1) { throw std::runtime_error("Could not read 1 expected byte from stream"); } - break; - default: - break; } memset(ubuffer, 0, 3*sizeof(UChar)); utf8::utf8to32(cbuffer, cbuffer+i, ubuffer);