commit 3e09b53d4d186c9a4f5a57b386d0958e0481c49a Author: vivekvardhanadepu Date: Sun May 23 18:18:33 2021 +0530 lexical training script init diff --git a/README.md b/README.md index 21463b6..7984ab5 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ pre-training: preProcessing.sh lang-models: make_lang_model.sh -alignment: alignment.sh[using fast_align, [Chris Dyer](http://www.cs.cmu.edu/~cdyer), [Victor Chahuneau](http://victor.chahuneau.fr), and [Noah A. Smith](http://www.cs.cmu.edu/~nasmith). (2013). [A Simple, Fast, and Effective Reparameterization of IBM Model 2](http://www.ark.cs.cmu.edu/cdyer/fast_valign.pdf). In *Proc. of NAACL*. +alignment: alignment.sh[using fast_align, [Chris Dyer](http://www.cs.cmu.edu/~cdyer), [Victor Chahuneau](http://victor.chahuneau.fr), and [Noah A. Smith](http://www.cs.cmu.edu/~nasmith). (2013). [A Simple, Fast, and Effective Reparameterization of IBM Model 2](http://www.ark.cs.cmu.edu/cdyer/fast_valign.pdf). In *Proc. of NAACL*.] rule-extraction: rule_extraction.sh diff --git a/coding_challenges/parser_test.out b/coding_challenges/parser_test.out deleted file mode 100644 index a351e5b..0000000 --- a/coding_challenges/parser_test.out +++ /dev/null @@ -1 +0,0 @@ -{'title': 'TOML Example', 'owner': {'name': 'Tom Preston-Werner', 'dob': DateTime(1979, 5, 27, 7, 32, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=57600), '-08:00'))}, 'database': {'server': '192.168.1.1', 'ports': [8000, 8001, 8002], 'connection_max': 5000, 'enabled': True}, 'servers': {'alpha': {'ip': '10.0.0.1', 'dc': 'eqdc10'}, 'beta': {'ip': '10.0.0.2', 'dc': 'eqdc10'}}, 'clients': {'data': [['gamma', 'delta'], [1, 2]], 'hosts': ['alpha', 'omega'], 'str1': 'Roses are red\nViolets are blue', 'str2': 'The quick brown fox jumps over the lazy dog.', 'float4': 5e+22, 'float5': 1000000.0, 'float6': -0.02, 'float7': 6.626e-34, 'float8': 224617.445991228, 'infinite1': inf, 'infinite2': inf, 'infinite3': -inf, 'not1': nan, 'not2': nan, 're': '\\d{2} apps is t[wo]o many', 'lines': 'The first newline is\ntrimmed in raw strings.\nAll other whitespace\nis preserved.\n'}} diff --git a/coding_challenges/parser_test.toml b/coding_challenges/parser_test.toml deleted file mode 100644 index 9f39c18..0000000 --- a/coding_challenges/parser_test.toml +++ /dev/null @@ -1,71 +0,0 @@ -# This is a TOML document. - -title = "TOML Example" - -[owner] -name = "Tom Preston-Werner" -dob = 1979-05-27T07:32:00-08:00 # First class dates - -[database] -server = "192.168.1.1" -ports = [ 8000, 8001, 8002 ] -connection_max = 5000 -enabled = true - -[servers] - - # Indentation (tabs and/or spaces) is allowed but not required - [servers.alpha] - ip = "10.0.0.1" - dc = "eqdc10" - - [servers.beta] - ip = "10.0.0.2" - dc = "eqdc10" - -[clients] -data = [ ["gamma", "delta"], [1, 2] ] - -# Line breaks are OK when inside arrays -hosts = [ - "alpha", - "omega" -] - -str1 = """ -Roses are red -Violets are blue""" - -str2 = """\ - The quick brown \ - fox jumps over \ - the lazy dog.\ - """ - - # exponent -float4 = 5e+22 -float5 = 1e06 -float6 = -2E-2 - -# both -float7 = 6.626e-34 - -# separators -float8 = 224_617.445_991_228 - -# infinity -infinite1 = inf # positive infinity -infinite2 = +inf # positive infinity -infinite3 = -inf # negative infinity - -# not a number -not1 = nan -not2 = +nan - -re = '''\d{2} apps is t[wo]o many''' -lines = ''' -The first newline is -trimmed in raw strings. -All other whitespace -is preserved. -''' \ No newline at end of file diff --git a/coding_challenges/toml_parser.py b/coding_challenges/toml_parser.py deleted file mode 100644 index af4a056..0000000 --- a/coding_challenges/toml_parser.py +++ /dev/null @@ -1,17 +0,0 @@ -from tomlkit import parse, dumps - -def main(): - with open('parser_test.toml') as test_file: - data_toml = test_file.read() - data_json = parse(data_toml) - - # gives error if not parsed well - assert data_toml == dumps(data_json) - - # outputting as a dictionary - with open('parser_test.out', 'w') as json_file: - print(data_json, file=json_file) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..9ba35b5 --- /dev/null +++ b/config.toml @@ -0,0 +1,25 @@ +# configuration for lexical training + +# corpus name +CORPUS = + +# source language +SL = + +# target language +TL = + +# source corpus +CORPUS_SL = + +# target corpus +CORPUS_TL = + +# apertium-lex-tools scripts +LEX_TOOLS = + +# apertium language data +DATA = + +# fast align build folder +FAST_ALIGN = \ No newline at end of file diff --git a/config_parser.py b/config_parser.py new file mode 100644 index 0000000..fb544d8 --- /dev/null +++ b/config_parser.py @@ -0,0 +1,14 @@ +from tomlkit import parse, dumps + +def parse_config(filename='config.toml'): + with open(filename) as config_file: + config_toml = config_file.read() + config = parse(config_toml) + + # gives error if not parsed well + assert config_toml == dumps(config) + + return config + +if __name__ == '__main__': + parse_config() \ No newline at end of file diff --git a/lexical_training.log b/lexical_training.log new file mode 100644 index 0000000..e69de29 diff --git a/lexical_training.py b/lexical_training.py new file mode 100644 index 0000000..1208a96 --- /dev/null +++ b/lexical_training.py @@ -0,0 +1,9 @@ +# lexical training script +from config_parser import parse_config + +def main(): + config = parse_config() + print(config) + +if __name__ == '__main__': + main() \ No newline at end of file