Index: branches/apertium-tagger/experiments/add_to_wikitable.py
===================================================================
--- branches/apertium-tagger/experiments/add_to_wikitable.py (revision 68836)
+++ branches/apertium-tagger/experiments/add_to_wikitable.py (revision 68838)
@@ -134,7 +134,7 @@
             last_td.contents = last_td.contents[:-1]
         while len(tr.contents.nodes) < col_idx:
             tr.contents.append(mk_empty_td())
-        tr.contents.append(mk_empty_td(last_node=True))
+        tr.contents.append(mk_empty_td(is_last=True))
     target_cell = tr.contents.get(col_idx)
     has_newline = target_cell.contents.endswith('\n')
     print('target_cell', target_cell)
Index: branches/apertium-tagger/experiments/requirements.txt
===================================================================
--- branches/apertium-tagger/experiments/requirements.txt (revision 68836)
+++ branches/apertium-tagger/experiments/requirements.txt (revision 68838)
@@ -1,2 +1,3 @@
 aitertools==0.1.0
 tabulate==0.7.5
+-e git+https://github.com/frankier/streamparser.git@setup-py#egg=streamparser
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py (revision 68836)
+++ branches/apertium-tagger/experiments/run_experiment.py (revision 68838)
@@ -25,16 +25,19 @@
     'cat': ['texts/miscellaneous.tagged.txt'],
     'spa': ['texts/miscellaneous.tagged.txt'],
     'hbs': ['hbs-tagger-data/hbs.tagged.txt'],
-    'rus': ['texts/son-smešnogo-čeloveka.ana.txt'],
+    'rus': [
+        # 'texts/son-smešnogo-čeloveka.ana.txt', incorrect format?
+        'texts/dva-samoubijstva.ana.txt'  # seems to be only one in correct format?
+    ],
     'kaz': ['eval/ref.1000.txt'],
     'por': [
-        'texts/bering.txt',
-        'texts/cultura.txt',
-        'texts/beringia.txt',
-        'texts/raio.txt',
-        'texts/música.txt',
-        'texts/água.txt',
-        'texts/akatsuki.txt',
+        'texts/água.txt',  # ambiguous
+        'texts/raio.txt',  # ambiguous
+        # 'texts/música.txt', ambiguous, syntax errors
+        'texts/akatsuki.txt',  # ambiguous
+        'texts/beringia.txt',  # ambiguous
+        'texts/cultura.txt',  # ambiguous
+        'texts/bering.txt',  # ambiguous
     ],
 }
 TSX_MAP = {
@@ -42,8 +45,6 @@
 }
 DEFAULT_LANGUAGES = ['cat', 'spa', 'hbs', 'rus', 'kaz', 'por', 'swe']
 NO_TSX_LANGUAGES = ['rus']
-STRIP_AT_LANGUAGES = ['rus']
-AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
 
 
 def comma_list(s):
@@ -88,6 +89,10 @@
         '--reuse',
         help="Reuse preprocesed dictionary and corpa from previous run",
         action='store_true')
+    parser.add_argument(
+        '--output',
+        help="Output file for the results of the experiment",
+        action='store')
     return parser.parse_args()
 
 
@@ -220,11 +225,20 @@
     return cmd
 
 
+AT_TAG_REGEX = re.compile('<@[←→+-]?[A-Z]+[←→+-]?>')
+PRPERS_TAG = re.compile(r'/\+')  # a '/' immediately followed by '+'
+
+
 @filter
-def strip_at_tag(line):
-    return AT_TAG_REGEX.sub('', line)
+def cleanup_rus(line):
+    return PRPERS_TAG.sub('/', AT_TAG_REGEX.sub('', line))
 
 
+TEXT_CLEANUP_MAP = {
+    'rus': cleanup_rus,
+}
+
+
 @filter(iter_filter=True)
 def strip_unknown_sent(gen):
     buff = []
@@ -362,8 +376,8 @@
         joined = itertools.chain(*(open(fn).readlines()
                                    for fn in self.text_fns))
 
-        if self.lang in STRIP_AT_LANGUAGES:
-            strip_unknown_in = strip_at_tag(joined)
+        if self.lang in TEXT_CLEANUP_MAP:
+            strip_unknown_in = TEXT_CLEANUP_MAP[self.lang](joined)
         else:
             strip_unknown_in = joined
         strip_unknown_sent(strip_unknown_in, self.joined_fn)
@@ -519,6 +533,9 @@
     finally:
         result_pretty = pformat(languages_tagger_accuracies)
         print(result_pretty)
-        outf = pjoin(TMPDIR, 'result-{}.pyson'
-                     .format(datetime.datetime.now().isoformat()))
+        if args.output:
+            outf = args.output
+        else:
+            outf = pjoin(TMPDIR, 'result-{}.pyson'
+                         .format(datetime.datetime.now().isoformat()))
         open(outf, 'w').write(result_pretty)
Index: languages/apertium-srd/Jenkinsfile
===================================================================
--- languages/apertium-srd/Jenkinsfile (nonexistent)
+++ languages/apertium-srd/Jenkinsfile (revision 68838)
@@ -0,0 +1,7 @@
+node {
+    stage 'Checkout'
+    checkout scm
+
+    stage 'Build'
+    sh "./autogen.sh && make clean && make"
+}