Index: branches/apertium-tagger/experiments/add_to_wikitable.py
===================================================================
--- branches/apertium-tagger/experiments/add_to_wikitable.py (revision 70990)
+++ branches/apertium-tagger/experiments/add_to_wikitable.py (nonexistent)
@@ -1,231 +0,0 @@
-# -- encoding: utf-8 --
-
-import sys
-import locale
-import mwparserfromhell
-from mwparserfromhell.nodes.tag import Tag
-from mwparserfromhell.wikicode import Wikicode
-
-TAGGER_ORDER = ['1st', 'unigram1', 'unigram2', 'unigram3', 'bigram', 'lwsw']
-
-
-def rdict(d):
- return {v: k for k, v in d.items()}
-
-LANG_CODE_NAME_MAP = {
- 'cat': 'Catalan',
- 'spa': 'Spanish',
- 'hbs': 'Serbo-Croatian',
- 'rus': 'Russian',
- 'kaz': 'Kazakh',
- 'por': 'Portuguese',
- 'swe': 'Swedish',
-}
-LANG_NAME_CODE_MAP = rdict(LANG_CODE_NAME_MAP)
-
-
-def name_to_attrs(name):
- attrs = {}
- for tagger in TAGGER_ORDER:
- if tagger in name:
- attrs['tagger'] = tagger
-
- if 'cg' in name:
- attrs['cg'] = True
- else:
- attrs['cg'] = False
-
- if attrs['tagger'] == 'lwsw':
- attrs['sup'] = None
- elif 'unsup' in name:
- attrs['sup'] = False
- elif 'sup' in name:
- attrs['sup'] = True
- else:
- attrs['sup'] = None
-
- if '_i' in name:
- attrs['iters'] = int(name.split('_i')[1])
- else:
- attrs['iters'] = None
-
- return attrs
-
-
-def attrs_to_sort_tuple(attrs):
- # tagger; unsup, sup; nocg, cg; iters
- return (TAGGER_ORDER.index(attrs['tagger']),
- attrs['sup'], attrs['cg'], attrs['iters'])
-
-
-def attrs_to_str(attrs):
- if attrs['tagger'].startswith('unigram'):
- out = 'Unigram model ' + attrs['tagger'][len('unigram'):]
- elif attrs['tagger'] == '1st':
- out = attrs['tagger']
- else:
- out = attrs['tagger'].title()
-
- if (attrs['cg']):
- out = "CG→" + out
-
- if attrs['sup'] is not None or attrs['iters'] is not None:
- bits = []
- if attrs['sup'] is not None:
- bits.append('sup' if attrs['sup'] else 'unsup')
- if attrs['iters'] is not None:
- bits.append('{} iters'.format(attrs['iters']))
- out += ' ({})'.format(', '.join(bits))
-
- return out
-
-
-def value_to_str(value):
- if hasattr(value, "__getitem__"):
- return "{2:.2f}±{3:.2f}".format(*(v * 100 for v in value))
- else:
- return "{0:.2f}".format(value * 100)
-
-
-def result_to_str(result):
- return '{}, {}'.format(value_to_str(result[0]), value_to_str(result[1]))
-
-
-def mk_title_td(title):
- return Tag(
- 'td',
- wiki_markup='|',
- contents=" '''{}''' ".format(title),
- closing_wiki_markup='')
-
-
-def mk_val_td(val, is_last=False):
- return Tag(
- 'td',
- wiki_markup='||',
- attrs=['align=right'],
- contents=" {} {}".format(val, "\n" if is_last else ""),
- wiki_style_separator='|',
- closing_wiki_markup='')
-
-
-def mk_empty_td(is_last=False):
- return Tag(
- 'td',
- wiki_markup='||',
- contents="\n" if is_last else "",
- closing_wiki_markup='')
-
-
-def mk_wc_td(val, is_first=False, is_last=False):
- return Tag(
- 'td',
- wiki_markup='!' if is_first else '!!',
- contents=" {}{}".format(val, "\n" if is_last else " "),
- closing_wiki_markup='')
-
-
-def mk_initial_tr(title):
- return Tag(
- 'tr',
- wiki_markup='|-\n',
- contents=Wikicode([mk_title_td(title), mk_empty_td(is_last=True)]),
- closing_wiki_markup='')
-
-input_table = sys.stdin.read()
-
-lang_order = []
-
-table = mwparserfromhell.parse(input_table.strip())
-table_inner = table.get(0).contents
-headings = table_inner.get(2).contents.nodes
-for tag in headings:
- if not isinstance(tag, Tag):
- continue
- title = tag.contents.strip()
- if not title:
- continue
- lang_order.append(LANG_NAME_CODE_MAP[title])
-
-
-def insert_into_tr(tr, col_idx, val_str):
- if len(tr.contents.nodes) <= col_idx:
- last_td = tr.contents.get(-1)
- if last_td.contents.endswith('\n'):
- last_td.contents = last_td.contents[:-1]
- while len(tr.contents.nodes) < col_idx:
- tr.contents.append(mk_empty_td())
- tr.contents.append(mk_empty_td(is_last=True))
- target_cell = tr.contents.get(col_idx)
- has_newline = target_cell.contents.endswith('\n')
- val_td = mk_val_td(val_str, is_last=has_newline)
- tr.contents.set(col_idx, val_td)
-
-
-def insert_into_wc(tr, col_idx, val_str):
- target_cell = tr.contents.get(col_idx)
- has_newline = target_cell.contents.endswith('\n')
- is_first = len(target_cell.wiki_markup) == 1
- val_td = mk_wc_td(val_str, is_first=is_first, is_last=has_newline)
- tr.contents.set(col_idx, val_td)
-
-
-def format_word_count(word_count):
- locale.setlocale(locale.LC_ALL, 'en_US')
- number = locale.format("%d", word_count, grouping=True)
- return "{}".format(number)
-
-if sys.argv[1] in LANG_CODE_NAME_MAP:
- # blank out column
- col_idx = lang_order.index(sys.argv[1]) + 1
- table_idx = 3
- while table_idx < len(table_inner.nodes):
- tr = table_inner.get(table_idx)
- if len(tr.contents.nodes) > col_idx:
- if tr.contents.get(col_idx).contents.endswith('\n'):
- tr.contents.get(col_idx).contents = '\n'
- else:
- tr.contents.get(col_idx).contents = ''
- table_idx += 1
- print(table)
- sys.exit()
-
-input_data = {}
-
-for arg in sys.argv[1:]:
- i = eval(open(arg).read())
- for k in i:
- input_data[k] = i[k]
-
-for lang, data in input_data.items():
- lang_idx = lang_order.index(lang)
- col_idx = lang_idx + 1
- word_count = data.pop('word_count', None)
- if word_count is not None:
- word_count_tr = table_inner.get(3)
- insert_into_wc(word_count_tr, col_idx, format_word_count(word_count))
- data = [(name_to_attrs(name), result_to_str(value))
- for name, value in data.items()]
- data = sorted(data, key=lambda pair: attrs_to_sort_tuple(pair[0]))
- table_idx = 4
- for attrs, val_str in data:
- title_str = attrs_to_str(attrs)
- while table_idx < len(table_inner.nodes):
- tr = table_inner.get(table_idx)
- if len(tr.contents) > 1:
- cell_contents = tr.contents.get(0).contents
- existing_title_str = str(cell_contents).strip(' ').strip("'")
- if existing_title_str == title_str:
- # insert into existing
- insert_into_tr(tr, col_idx, val_str)
- break
- else:
- table_inner.remove(tr, recursive=False)
- table_idx += 1
- else:
- # append to end
- tr = mk_initial_tr(title_str)
- insert_into_tr(tr, col_idx, val_str)
- table_inner.append(tr)
-
-print(table)
Index: branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/allflattagswrdbigram.mtx (revision 70993)
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/carttagswrdbigram.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/carttagswrdbigram.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/carttagswrdbigram.mtx (revision 70993)
@@ -0,0 +1,32 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/commondefns.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/commondefns.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/commondefns.mtx (revision 70993)
@@ -0,0 +1,48 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/majortagsbigram.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/majortagsbigram.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/majortagsbigram.mtx (revision 70993)
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/morphodita.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/morphodita.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/morphodita.mtx (revision 70993)
@@ -0,0 +1,387 @@
+
+
+
+]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ &commondefns;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/proposed.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/proposed.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/proposed.mtx (revision 70993)
@@ -0,0 +1,50 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ...
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/spacyflattags.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/spacyflattags.mtx (revision 70990)
+++ branches/apertium-tagger/experiments/mtx/spacyflattags.mtx (revision 70993)
@@ -1,17 +1,31 @@
+
+
+
+]>
+
+ &commondefns;
+
+
-
-
-
+
+
+
+
-
-
+
+
-
+
-
+
+
@@ -20,6 +34,7 @@
+
@@ -26,6 +41,7 @@
+
@@ -32,6 +48,7 @@
+
@@ -39,22 +56,16 @@
+
-
-
-
-
-
-
-
-
+
@@ -64,6 +75,7 @@
+
@@ -72,6 +84,7 @@
+
@@ -82,6 +95,7 @@
+
@@ -90,6 +104,7 @@
+
@@ -98,6 +113,7 @@
+
@@ -108,12 +124,16 @@
+
-
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/unigram_model1.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/unigram_model1.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/unigram_model1.mtx (revision 70993)
@@ -0,0 +1,32 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/unigram_model2.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/unigram_model2.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/unigram_model2.mtx (revision 70993)
@@ -0,0 +1,56 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/mtx/unigram_model3.mtx
===================================================================
--- branches/apertium-tagger/experiments/mtx/unigram_model3.mtx (nonexistent)
+++ branches/apertium-tagger/experiments/mtx/unigram_model3.mtx (revision 70993)
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: branches/apertium-tagger/experiments/run_experiment.py
===================================================================
--- branches/apertium-tagger/experiments/run_experiment.py (revision 70990)
+++ branches/apertium-tagger/experiments/run_experiment.py (revision 70993)
@@ -55,8 +55,11 @@
'cg:texts/raio.tagged.txt',
],
'swe': [
- 'cgr:texts/tid.tagged.txt'
+ 'cgr:texts/tid.tagged.txt',
],
+ 'ita': [
+ 'cg:texts/puupankki/puupankki.ita.vislcg',
+ ],
}
TSX_MAP = {
'hbs': 'apertium-hbs.hbs-coarse.tsx',
@@ -132,7 +135,7 @@
return parser.parse_args()
-PREPROCESSER_MAP = {
+PREPROCESSOR_MAP = {
'cg': cg_conv_clean,
'cgr': functools.partial(cg_conv_clean, rtl=True),
}
@@ -209,10 +212,10 @@
self.text_fns = []
for text in texts:
if ':' in text:
- preprocesser_name, text = text.split(':', 1)
+ preprocessor_name, text = text.split(':', 1)
else:
- preprocesser_name = None
- self.text_fns.append((preprocesser_name,
+ preprocessor_name = None
+ self.text_fns.append((preprocessor_name,
pjoin(lang_root, text)))
self.joined_fn = pjoin(self.work_dir, 'joined')
self.ref_fn = pjoin(self.work_dir, 'ref')
@@ -260,13 +263,13 @@
mkdir(self.work_dir)
preprocessed_texts = []
- for i, (preprocesser_name, fn) in enumerate(self.text_fns):
- if preprocesser_name:
- preprocesser = PREPROCESSER_MAP.get(preprocesser_name)
+ for i, (preprocessor_name, fn) in enumerate(self.text_fns):
+ if preprocessor_name:
+ preprocessor = PREPROCESSOR_MAP.get(preprocessor_name)
cleaned_fn = pjoin(
self.work_dir,
- 'cleaned.{}.{}.txt'.format(i, preprocesser_name))
- preprocesser(input=fn, output=cleaned_fn)
+ 'cleaned.{}.{}.txt'.format(i, preprocessor_name))
+ preprocessor(input=fn, output=cleaned_fn)
preprocessed_texts.append(cleaned_fn)
else:
preprocessed_texts.append(fn)
Index: languages/apertium-ita/texts/puupankki/puupankki.ita.vislcg
===================================================================
--- languages/apertium-ita/texts/puupankki/puupankki.ita.vislcg (revision 70990)
+++ languages/apertium-ita/texts/puupankki/puupankki.ita.vislcg (revision 70993)
@@ -13209,7 +13209,6 @@
"," cm
""
"*AT"
-&$
""
"*amp"
"<;>"