commit 2de39d0334561ffaa69444687d6fd8b870f16585 Author: vivekvardhanadepu Date: Thu Jun 24 16:03:00 2021 +0530 minor fixes diff --git a/clean_corpus.py b/clean_corpus.py index ce84b89..2aac7fa 100644 --- a/clean_corpus.py +++ b/clean_corpus.py @@ -4,6 +4,7 @@ import sys +import re def clean_corpus(corpus1, corpus2): @@ -17,44 +18,46 @@ def clean_corpus(corpus1, corpus2): lines2 = l2.readlines() assert len(lines1) == len(lines2) # print(lines1, lines2) - i = 0 for i in range(len(lines1)): - if (not lines1[i].strip()) or (not lines2[i].strip()): + removal_map = "".maketrans('', '', '°.*') + if (not lines1[i].translate(removal_map).strip()) != (not lines2[i].translate(removal_map).strip()): lines_to_remove.update([i-1, i, i+1]) continue - + # removing lines only with '°', '*' and '.' - if (not lines1[i].replace('°', '').replace('*', '').replace('.','').strip()) and \ - (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()): - lines_to_remove.add(i) + # if (not lines1[i].replace('°', '').replace('*', '').replace('.', '').strip()) and \ + # (not lines2[i].replace('°', '').replace('*', '').replace('.', '').strip()): + # lines_to_remove.add(i) # print(lines1, lines2) - + if (not lines1[i].translate(removal_map).strip()) and \ + (not lines2[i].translate(removal_map).strip()): + lines_to_remove.add(i) + # print(lines_to_remove) - l1.seek(0) - # l1.write(''.join(lines1)) - l1.write('') - l1.truncate() + # l1.seek(0) + # # l1.write(''.join(lines1)) + # l1.write('') + l1.truncate(0) - l2.seek(0) - l2.write('') - l2.truncate() + # l2.seek(0) + # l2.write('') + l2.truncate(0) - with open(corpus1, 'a') as l1, open(corpus2, 'a') as l2: + with open(corpus1, 'w') as l1, open(corpus2, 'w') as l2: lines_to_keep = set() lines_to_keep.update([i for i in range(len(lines1))]) lines_to_keep = lines_to_keep - lines_to_remove - - for i in sorted(lines_to_keep): - # also removing leading and trailing spaces - l1.write(lines1[i].strip() + '\n') - l2.write(lines2[i].strip() + '\n') - - l1.truncate() - l2.truncate() + + # also removing leading and trailing spaces + l1.writelines( + re.sub(' +', ' ', lines1[i]).strip()+'\n' for i in sorted(lines_to_keep)) + l2.writelines( + re.sub(' +', ' ', lines2[i]).strip()+'\n' for i in sorted(lines_to_keep)) + if __name__ == '__main__': if len(sys.argv) != 3: print('usage: clean_corpus.py ') exit(1) - clean_corpus(sys.argv[1], sys.argv[2]) \ No newline at end of file + clean_corpus(sys.argv[1], sys.argv[2]) diff --git a/lexical_training.py b/lexical_training.py index 3be2eb8..d74a2af 100644 --- a/lexical_training.py +++ b/lexical_training.py @@ -111,7 +111,7 @@ def training(config, cache_dir, log): cmds = [['head', '-n', str(training_lines)], ['apertium', '-d', config['LANG_DATA'], config['SL']+'-'+config['TL']+'-tagger'], - ['sed', 's/ \+/ /g'], ['apertium-pretransfer']] + ['apertium-pretransfer']] with open(config['CORPUS_SL']) as inp, open(sl_tagged, 'w') as outp: pipe(cmds, inp, outp, log).wait() @@ -122,7 +122,7 @@ def training(config, cache_dir, log): cmds = [['head', '-n', str(training_lines)], ['apertium', '-d', config['LANG_DATA'], config['TL']+'-'+config['SL']+'-tagger'], - ['sed', 's/ \+/ /g'], ['apertium-pretransfer']] + ['apertium-pretransfer']] with open(config['CORPUS_TL']) as inp, open(tl_tagged, 'w') as outp: pipe(cmds, inp, outp, log).wait() @@ -130,30 +130,32 @@ def training(config, cache_dir, log): # p2 = Popen(c2, stdin=p1.stdout, stdout=PIPE, stderr=training_log) # removing lines with no analyses - with open(lines, 'w+') as f0: + with open(lines, 'w') as f: call(['seq', '1', str(training_lines)], - stdout=f0, stderr=log) - clean_tagged = os.path.join( - cache_dir, config['CORPUS']+'.clean_tagged') - with open(clean_tagged, 'w+') as f1: - cmds = [['paste', lines, sl_tagged, tl_tagged], - ['grep', '<*\t*<']] - pipe(cmds, None, f1, log).wait() - - # f1.seek(0) - call(['cut', '-f', '1'], stdin=f1, stdout=f0, stderr=log) - - f1.seek(0) - with open(sl_tagged, 'w') as f2: - cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'], - ['sed', 's/\$[^\^]*/$ /g']] - pipe(cmds, f1, f2, log).wait() - - f1.seek(0) - with open(tl_tagged, 'w') as f2: - cmds = [['cut', '-f', '3'], ['sed', 's/ /~/g'], - ['sed', 's/\$[^\^]*/$ /g']] - pipe(cmds, f1, f2, log).wait() + stdout=f, stderr=log) + + clean_tagged = os.path.join( + cache_dir, config['CORPUS']+'.clean_tagged') + with open(clean_tagged, 'w') as f1: + cmds = [['paste', lines, sl_tagged, tl_tagged], + ['grep', '<*\t*<']] + pipe(cmds, None, f1, log).wait() + + with open(clean_tagged, 'r') as f0: + with open(lines, 'w') as f1: + call(['cut', '-f', '1'], stdin=f0, stdout=f1, stderr=log) + + f0.seek(0) + with open(sl_tagged, 'w') as f2: + cmds = [['cut', '-f', '2'], ['sed', 's/ /~/g'], + ['sed', 's/\$[^\^]*/$ /g']] + pipe(cmds, f0, f2, log).wait() + + f0.seek(0) + with open(tl_tagged, 'w') as f2: + cmds = [['cut', '-f', '3'], ['sed', 's/ /~/g'], + ['sed', 's/\$[^\^]*/$ /g']] + pipe(cmds, f0, f2, log).wait() os.remove(clean_tagged) @@ -162,6 +164,7 @@ def training(config, cache_dir, log): with open(os.devnull, 'r') as f1: call(['paste', '-d', '||| ', tl_tagged, '-', '-', '-', sl_tagged], stdin=f1, stdout=f, stderr=log) + with open(alignment, 'w') as f: call([config['FAST_ALIGN'], '-i', tagged_merged, '-d', '-o', '-v'], stdout=f, stderr=log) @@ -170,6 +173,7 @@ def training(config, cache_dir, log): data = f.read() f.seek(0) f.write(data.replace('~', ' ')) + with open(tl_tagged, 'r+') as f: data = f.read() f.seek(0)