commit 52cc3be3994d777031e28f4dd015bddadf0de303
Author: vivekvardhanadepu <vivekvicky839@gmail.com>
Date:   Sun Aug 1 15:54:07 2021 +0530

    Add non-parallel training checks to check_config and a GitHub Actions job for non-parallel training

diff --git a/.github/workflows/training.yml b/.github/workflows/training.yml
index 2c70d5c..4477189 100644
--- a/.github/workflows/training.yml
+++ b/.github/workflows/training.yml
@@ -18,18 +18,18 @@ jobs:
           sudo apt-get -qfy install python3-pip
           pip3 install -r requirements.txt
           
-      - name: run
+      - name: Parallel
         run: "! python3 check_config.py tests/training/config.toml"
 
-  training:
-    name: lexical selection training
+      - name: Non-parallel
+        run: "! python3 check_config.py tests/training/config-np.toml"
+
+  parallel_training:
+    name: parallel corpora training
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
 
-      # - name: Running check_config.py(before installation of prerequisites)
-      #   run: python3 check_config.py tests/training/config.toml
-
       - name: Installing apertium dependencies
         run: |
           sudo apt-get -qy update
@@ -53,7 +53,7 @@ jobs:
           make -j4 VERBOSE=1 V=1
           cd ..
 
-      - name: checking out apertium-eng-spa
+      - name: Checking out apertium-eng-spa
         uses: actions/checkout@v2
         with:
           repository: apertium/apertium-eng-spa
@@ -73,3 +73,52 @@ jobs:
           
       - name: Training
         run: python3 lexical_selection_training.py tests/training/config.toml
+    
+  non_parallel_training:
+    name: non-parallel corpora training
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Installing apertium dependencies
+        run: |
+          sudo apt-get -qy update
+          sudo apt-get -qfy install wget ca-certificates
+          wget -q https://apertium.projectjj.com/apt/install-nightly.sh -O - | sudo bash
+          sudo apt-get -qfy install --no-install-recommends apertium-all-dev
+
+      - name: Checking out apertium-eng-spa
+        uses: actions/checkout@v2
+        with:
+          repository: apertium/apertium-eng-spa
+          path: apertium-eng-spa
+
+      - name: Installing apertium-eng-spa locally
+        working-directory: apertium-eng-spa
+        run: |
+          autoreconf -fvi
+          ./configure
+          make -j4 VERBOSE=1 V=1
+
+      - name: Installing python dependencies
+        run: |
+          sudo apt-get -qfy install python3-pip
+          pip3 install -r requirements.txt
+
+      - name: Checking out IRSTLM
+        uses: actions/checkout@v2
+        with:
+          repository: irstlm-team/irstlm
+          path: irstlm
+
+      - name: Installing IRSTLM
+        working-directory: irstlm
+        run: |
+          sed -i 's/isystem/I/' src/Makefile.am
+          sh regenerate-makefiles.sh
+          ./configure
+          make -j4 VERBOSE=1 V=1
+          sudo make install
+        
+      - name: Training
+        run: python3 lexical_selection_training.py tests/training/config-np.toml
diff --git a/check_config.py b/check_config.py
index 533483e..5beccdd 100644
--- a/check_config.py
+++ b/check_config.py
@@ -11,13 +11,14 @@ fast_align_url = "https://github.com/clab/fast_align"
 langs_url = "https://wiki.apertium.org/wiki/List_of_language_pairs"
 apertium_url = "https://wiki.apertium.org/wiki/Installation"
 yasmet_url = "https://wiki.apertium.org/wiki/Using_weights_for_ambiguous_rules"
+irstlm_url = "https://wiki.apertium.org/wiki/IRSTLM"
 
 
-def check_config(filename='config.toml'):
+def check_config(config_filename):
     misconfigured = False
     lex_tools_paths = ['/opt/local/share/apertium-lex-tools',
                        '/usr/local/share/apertium-lex-tools', '/usr/share/apertium-lex-tools']
-    with open(filename) as config_file:
+    with open(config_filename) as config_file:
         config_toml = config_file.read()
         config = parse(config_toml)
 
@@ -25,7 +26,7 @@ def check_config(filename='config.toml'):
     assert config_toml == dumps(config)
 
     # changing the paths to absolute
-    for key in ['CORPUS_SL', 'CORPUS_TL', 'FAST_ALIGN', 'LANG_DATA']:
+    for key in ['CORPUS_SL', 'CORPUS_TL', 'LANG_DATA']:
         if not os.path.isabs(config[key]):
             config[key] = os.path.join(os.path.abspath('.'), config[key])
 
@@ -39,56 +40,19 @@ def check_config(filename='config.toml'):
             f"'{config['CORPUS_TL']}'(CORPUS_TL) is not a file, provide a valid file or \nto download, look {corpora_url}\n")
         misconfigured = True
 
-    is_lex_tools_present = False
-    for lex_tools in lex_tools_paths:
-        if os.path.isdir(lex_tools):
-            scripts = ['extract-sentences.py', 'extract-freq-lexicon.py',
-                       'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py',
-                       'ngrams-to-rules-me.py', 'common.py']
-
-            for script in scripts:
-                if not os.path.isfile(os.path.join(lex_tools, script)):
-                    print(
-                        f"'{script}' is not present in '{lex_tools}', re-install apertium-lex-tools {apertium_url}\n")
-                    misconfigured = True
-            is_lex_tools_present = True
-
-    if not is_lex_tools_present:
-        print(
-            f"'apertium_lex_tools'is not installed, to install apertium-lex-tools follow {apertium_url}\n")
-        misconfigured = True
-
-        # assuming scripts are intact
-        # if 'process-tagger-output' not in os.listdir(config['LEX_TOOLS']):
-        #     print("'process-tagger-output' is not in", "'"+config['LEX_TOOLS']+"'(LEX_TOOLS),",
-        #           "provide a valid directory or \nto install, follow", lex_tools_url, '\n')
-        #     misconfigured = True
-
-    if not os.path.isfile(config['FAST_ALIGN']):
-        print(
-            f"'{config['FAST_ALIGN']}'(FAST_ALIGN) is not a file, provide a valid executable or \nto install, follow {fast_align_url}\n")
-        misconfigured = True
-    # else:
-    #     if 'fast_align' not in os.listdir(config['FAST_ALIGN']):
-    #         print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \
-    #                         "provide a valid directory or \nto install, follow", fast_align_url, '\n')
-    #         misconfigured = True
-
     if not os.path.isdir(config['LANG_DATA']):
         print(
             f"'{config['LANG_DATA']}'(LANG_DATA) is not a directory, provide a valid directory or \nto install, follow {langs_url}\n")
         misconfigured = True
     else:
-        sl_tl_autobil = f"{config['SL']}-{config['TL']}.autobil.bin"
-        tl_sl_autobil = f"{config['TL']}-{config['SL']}.autobil.bin"
-        if sl_tl_autobil not in os.listdir(config['LANG_DATA']):
-            print(f"'{sl_tl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), \
-                  provide a valid directory or \nto install, follow {langs_url}\n")
-            misconfigured = True
-        if tl_sl_autobil not in os.listdir(config['LANG_DATA']):
-            print(f"'{tl_sl_autobil}' is not in '{config['LANG_DATA']}'(LANG_DATA), \
-                  provide a valid directory or \nto install, follow {langs_url}\n")
-            misconfigured = True
+        modules = []
+        modules.append(f"{config['SL']}-{config['TL']}.autobil.bin")
+        modules.append(f"{config['TL']}-{config['SL']}.autobil.bin")
+        for module in modules:
+            if module not in os.listdir(config['LANG_DATA']):
+                print(f"'{module}' is not in '{config['LANG_DATA']}'(LANG_DATA), \
+                    provide a valid directory or \nto install, follow {langs_url}\n")
+                misconfigured = True
 
     apertium_present = False
     for path in os.environ["PATH"].split(os.pathsep):
@@ -101,40 +65,159 @@ def check_config(filename='config.toml'):
             f"apertium is either not installed or not added to path, see {apertium_url}\n")
         misconfigured = True
 
-    yasmet_present = False
-    for path in os.environ["PATH"].split(os.pathsep):
-        if os.path.isfile(os.path.join(path, 'yasmet')):
-            yasmet_present = True
-            break
-
-    if not yasmet_present:
-        print(
-            f"yasmet is either not installed or not added to path, install yasmet and add to the path, \
-                {yasmet_url} or re-install apertium-lex-tools with yasmet, {apertium_url}\n")
-        misconfigured = True
-
-    process_tagger_output_present = False
-    for path in os.environ["PATH"].split(os.pathsep):
-        if os.path.isfile(os.path.join(path, 'process-tagger-output')):
-            process_tagger_output_present = True
-            break
-
-    if not process_tagger_output_present:
+    if not isinstance(config['TRAINING_LINES'], int):
         print(
-            f"process-tagger-output is either not installed or not added to path, re-install apertium-lex-tools {apertium_url}\n")
+            f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer. pass an integer \n")
         misconfigured = True
 
-    if not isinstance(config['TRAINING_LINES'], int):
+    if not isinstance(config['IS_PARALLEL'], bool):
         print(
-            f"'{config['TRAINING_LINES']}'(TRAINING_LINES) is not an integer \n")
+            f"'{config['IS_PARALLEL']}'(IS_PARALLEL) is not an boolean. pass true or false \n")
         misconfigured = True
+    else:
+        if config['IS_PARALLEL']:
+            yasmet_present = False
+            for path in os.environ["PATH"].split(os.pathsep):
+                if os.path.isfile(os.path.join(path, 'yasmet')):
+                    yasmet_present = True
+                    break
+
+            if not yasmet_present:
+                print(
+                    f"yasmet is either not installed or not added to path, install yasmet and add to the path, \
+                        {yasmet_url} or \nre-install apertium-lex-tools with yasmet, {apertium_url}\n")
+                misconfigured = True
+
+            process_tagger_output_present = False
+            for path in os.environ["PATH"].split(os.pathsep):
+                if os.path.isfile(os.path.join(path, 'process-tagger-output')):
+                    process_tagger_output_present = True
+                    break
+
+            if not process_tagger_output_present:
+                print(
+                    f"process-tagger-output is not installed, re-install apertium-lex-tools {apertium_url}\n")
+                misconfigured = True
+
+            if not os.path.isabs(config['FAST_ALIGN']):
+                config['FAST_ALIGN'] = os.path.join(
+                    os.path.abspath('.'), config['FAST_ALIGN'])
+            if not os.path.isfile(config['FAST_ALIGN']):
+                print(
+                    f"'{config['FAST_ALIGN']}'(FAST_ALIGN) is not a file, provide a valid executable or \nto install, follow {fast_align_url}\n")
+                misconfigured = True
+            # else:
+            #     if 'fast_align' not in os.listdir(config['FAST_ALIGN']):
+            #         print("fast_align is not present in", "'"+config['FAST_ALIGN']+"'(FAST_ALIGN),", \
+            #                         "provide a valid directory or \nto install, follow", fast_align_url, '\n')
+            #         misconfigured = True
+
+            is_lex_tools_present = False
+            for lex_tools in lex_tools_paths:
+                if os.path.isdir(lex_tools):
+                    scripts = ['extract-sentences.py', 'extract-freq-lexicon.py',
+                               'ngram-count-patterns-maxent2.py', 'merge-ngrams-lambdas.py', 'lambdas-to-rules.py',
+                               'ngrams-to-rules-me.py', 'common.py']
+
+                    for script in scripts:
+                        if not os.path.isfile(os.path.join(lex_tools, script)):
+                            print(
+                                f"'{script}' is not present in '{lex_tools}', re-install apertium-lex-tools {apertium_url}\n")
+                            misconfigured = True
+                    is_lex_tools_present = True
+
+            if not is_lex_tools_present:
+                print(
+                    f"'apertium_lex_tools' is not installed, to install apertium-lex-tools follow {apertium_url}\n")
+                misconfigured = True
+
+        else:
+            if os.path.isdir(config['LANG_DATA']):
+                modules = []
+                modules.append(
+                    f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t1x")
+                modules.append(f"{config['SL']}-{config['TL']}.t1x.bin")
+                modules.append(
+                    f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t2x")
+                modules.append(f"{config['SL']}-{config['TL']}.t2x.bin")
+                modules.append(
+                    f"apertium-{config['SL']}-{config['TL']}.{config['SL']}-{config['TL']}.t3x")
+                modules.append(f"{config['SL']}-{config['TL']}.t3x.bin")
+                modules.append(f"{config['SL']}-{config['TL']}.autogen.bin")
+                modules.append(f"{config['SL']}-{config['TL']}.autopgen.bin")
+                for module in modules:
+                    if module not in os.listdir(config['LANG_DATA']):
+                        print(f"'{module}' is not in '{config['LANG_DATA']}'(LANG_DATA), \
+                            provide a valid directory or \nto install, follow {langs_url}\n")
+                        misconfigured = True
+
+            multitrans_present = False
+            for path in os.environ["PATH"].split(os.pathsep):
+                if os.path.isfile(os.path.join(path, 'multitrans')):
+                    multitrans_present = True
+                    break
+
+            if not multitrans_present:
+                print(
+                    f"multitrans is not installed, re-install apertium-lex-tools {apertium_url}\n")
+                misconfigured = True
+
+            ranker_present = False
+            for path in os.environ["PATH"].split(os.pathsep):
+                if os.path.isfile(os.path.join(path, 'irstlm-ranker')):
+                    ranker_present = True
+                    break
+
+            if not ranker_present:
+                print(
+                    f"irstlm-ranker is not installed, re-install apertium-lex-tools with irstlm {apertium_url}\n")
+                misconfigured = True
+
+            # if not 'IRSTLM' in os.environ:
+            #     print(
+            #         f"IRSTLM is either not installed or not defined as an environment variable, see {irstlm_url}\n")
+            #     misconfigured = True
+
+            irstlm_present = False
+            for path in os.environ["PATH"].split(os.pathsep):
+                if os.path.isfile(os.path.join(path, 'build-lm.sh')):
+                    irstlm_present = True
+                    break
+
+            if not irstlm_present:
+                print(
+                    f"'build-lm.sh' is not installed or added to path, see {irstlm_url}\n")
+                misconfigured = True
+
+            is_lex_tools_present = False
+            for lex_tools in lex_tools_paths:
+                if os.path.isdir(lex_tools):
+                    scripts = ['biltrans-extract-frac-freq.py', 'extract-alig-lrx.py',
+                               'biltrans-count-patterns-ngrams.py', 'ngram-pruning-frac.py', 'ngrams-to-rules.py',
+                               'biltrans_count_common.py', 'common.py']
+
+                    for script in scripts:
+                        if not os.path.isfile(os.path.join(lex_tools, script)):
+                            print(
+                                f"'{script}' is not present in '{lex_tools}', re-install apertium-lex-tools {apertium_url}\n")
+                            misconfigured = True
+                    is_lex_tools_present = True
+
+            if not is_lex_tools_present:
+                print(
+                    f"'apertium_lex_tools' is not installed, to install apertium-lex-tools follow {apertium_url}\n")
+                misconfigured = True
 
     if misconfigured:
         exit(1)
+    else:
+        print("prerequisites are properly installed")
 
     return config
 
 
 if __name__ == '__main__':
-    if(len(sys.argv)==2):
-        check_config(sys.argv[1])
+    config_file = 'config.toml'
+    if(len(sys.argv) == 2):
+        config_file = sys.argv[1]
+    check_config(config_file)
diff --git a/config.toml.example b/config.toml.example
index ed7ea17..a471413 100644
--- a/config.toml.example
+++ b/config.toml.example
@@ -3,10 +3,10 @@
 # corpus name
 CORPUS = "europarl-v7"
 
-# source language
+# source language (must match the language code used by Apertium)
 SL = "eng"
 
-# target language
+# target language (must match the language code used by Apertium)
 TL = "spa"
 
 # source corpus
@@ -18,11 +18,14 @@ CORPUS_TL = "europarl-v7.eng-spa.spa"
 # apertium-lex-tools scripts
 # LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
 
-# fast align build folder[not required for non-parallel training]
-FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
-
 # apertium language data
 LANG_DATA = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/apertium-eng-spa"
 
-# number of lines to be trained on
+# number of lines to be trained on (do not enclose in quotes)
 TRAINING_LINES = 100000
+
+# whether the corpora are parallel (true) or non-parallel (false)
+IS_PARALLEL = true
+
+# path to the fast_align executable, e.g. <build folder>/fast_align [not required for non-parallel training]
+FAST_ALIGN = "/home/vivek/Documents/FOSS/apertium/user-friendly-lexical-training/coding_challenges/fast_align/build"
diff --git a/lexical_selection_training.py b/lexical_selection_training.py
index 50df75f..c6465a9 100644
--- a/lexical_selection_training.py
+++ b/lexical_selection_training.py
@@ -64,7 +64,7 @@ def pipe(cmds, firstin, lastout, stderr):
     return procs[-1]
 
 
-def training(config, cache_dir, log):
+def parallel_training(config, cache_dir, log):
 
     MIN = 1
 
@@ -299,13 +299,21 @@ def training(config, cache_dir, log):
     #     ngrams_to_rules(ngrams_all)
 
 
+def non_parallel_training(config, cache_dir, log):
+    pass  # placeholder: non-parallel training is not implemented yet; main() dispatches here when IS_PARALLEL is false
+
+
 def main(config_file):
     print("validating configuration....")
     config = check_config(config_file)
 
-    # adding lex scripts to path
-    lex_tools = '/home/vivek/Documents/FOSS/apertium/lex-tools/scripts'
-    sys.path.insert(1, lex_tools)
+    # prepending the apertium-lex-tools script directories to Python's module search path (sys.path)
+    sys.path.insert(0, '/usr/share/apertium-lex-tools')
+    sys.path.insert(0, '/opt/local/share/apertium-lex-tools')
+    sys.path.insert(0, '/usr/local/share/apertium-lex-tools')
+
+    # TODO: remove after testing (developer-local lex-tools path)
+    sys.path.insert(0, '/home/vivek/Documents/FOSS/apertium/lex-tools/scripts')
 
     # cleaning the parallel corpus i.e. removing empty sentences, sentences only with '*', '.', or '°'
     print("cleaning corpus....")
@@ -313,6 +321,8 @@ def main(config_file):
 
     cache_dir = f"cache-{config['CORPUS']}-{config['SL']}-{config['TL']}"
 
+    if not config['IS_PARALLEL']:
+        cache_dir = cache_dir + '-np'
     # the directory where all the intermediary outputs are stored
     if os.path.isdir(cache_dir):
         if not query(f"Do you want to overwrite the files in '{cache_dir}'"):
@@ -325,7 +335,10 @@ def main(config_file):
     log = os.path.join(cache_dir, "training.log")
 
     with open(log, 'a') as log_file:
-        training(config, cache_dir, log_file)
+        if config['IS_PARALLEL']:
+            parallel_training(config, cache_dir, log_file)
+        else:
+            non_parallel_training(config, cache_dir, log_file)
     print("training complete!!")
 
 
diff --git a/tests/training/config-np.toml b/tests/training/config-np.toml
new file mode 100644
index 0000000..59b6af6
--- /dev/null
+++ b/tests/training/config-np.toml
@@ -0,0 +1,25 @@
+# configuration for lexical training
+
+# corpus name
+CORPUS = "europarl-v7"
+
+# source language (must match the language code used by Apertium)
+SL = "eng"
+
+# target language (must match the language code used by Apertium)
+TL = "spa"
+
+# source corpus
+CORPUS_SL = "tests/training/test.eng"
+
+# target corpus
+CORPUS_TL = "tests/training/test.spa"
+
+# apertium language data
+LANG_DATA = "apertium-eng-spa"
+
+# number of lines to be trained on (do not enclose in quotes)
+TRAINING_LINES = 100
+
+# whether the corpora are parallel (true) or non-parallel (false)
+IS_PARALLEL = false
diff --git a/tests/training/config.toml b/tests/training/config.toml
index 30076f2..1ac5762 100644
--- a/tests/training/config.toml
+++ b/tests/training/config.toml
@@ -3,23 +3,29 @@
 # corpus name
 CORPUS = "europarl-v7"
 
-# source language
-SL = "spa"
+# source language (must match the language code used by Apertium)
+SL = "eng"
 
-# target language
-TL = "eng"
+# target language (must match the language code used by Apertium)
+TL = "spa"
 
 # source corpus
-CORPUS_SL = "tests/training/test.spa"
+CORPUS_SL = "tests/training/test.eng"
 
 # target corpus
-CORPUS_TL = "tests/training/test.eng"
+CORPUS_TL = "tests/training/test.spa"
 
-# fast align
-FAST_ALIGN = "fast_align/build/fast_align"
+# apertium-lex-tools scripts
+# LEX_TOOLS = "/home/vivek/Documents/FOSS/apertium/apertium-lex-tools/scripts"
 
 # apertium language data
 LANG_DATA = "apertium-eng-spa"
 
 # number of lines to be trained on (do not enclose in quotes)
 TRAINING_LINES = 100
+
+# whether the corpora are parallel (true) or non-parallel (false)
+IS_PARALLEL = true
+
+# path to the fast_align executable [not required for non-parallel training]
+FAST_ALIGN = "fast_align/build/fast_align"