Index: branches/apertium-separable/testing/testfile.txt =================================================================== --- branches/apertium-separable/testing/testfile.txt (nonexistent) +++ branches/apertium-separable/testing/testfile.txt (revision 80041) @@ -0,0 +1,92 @@ + +---- SHOULD ALL BE SUCCESSFUL ---- +(1) n* +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ +^take$ ^birdog$ ^out$ + +(2.1) adj n* +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ + +(2.2) adj* n* +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ +^take$ ^big$ ^birdog$ ^out$ + +(2.3) adj* adj* (adj* ...) n* +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^big$ ^round$ ^birdog$ ^out$ + +(3.1) det* n* +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ +^take$ ^my$ ^birdog$ ^out$ + +(3.2) det* adj(*) n* +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ +^take$ ^my$ ^big$ ^round$ ^birdog$ ^out$ + +(4.1) prn.pers.* +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ +^take$ ^prpers$ ^out$ + +---- COUNTEREXAMPLES: NONE OF THE FOLLOWING SHOULD BE ACCEPTED ---- +^take$ ^birdog$ ^out$ +^big$ ^birdog +... +#incorrect tag count +#incorrect word phrase Index: branches/apertium-separable/testing/transducer.py =================================================================== --- branches/apertium-separable/testing/transducer.py (revision 80040) +++ branches/apertium-separable/testing/transducer.py (revision 80041) @@ -1,13 +1,33 @@ -import sys, re +#usage: python transducer.py testfile.txt +import sys + +""" +noun phrase acceptor: + n.* + adj n.* + adj.* n + adj.* adj.* n.* + det.* n.* + det.* adj n.* +prn.pers.* +prn.dem.* +np* +""" + transitions = { - (0,'^') : 1, - (1,'t') : 2, - (2,'a') : 3, - (3,'k') : 4, - (4,'e') : 5, - (5,'') : 6, + #if current_state is -1 and the next token (the next thing to print) + #is '^', then next_state() will print the next token, + #which is located at state[0], and set current_state to 0 + (-1,'^') : 0, + (0,'t') : 1, + (1,'a') : 2, + (2,'k') : 3, + (3,'e') : 4, + (4,'') : 5, + (5,'') : 6, (6,'') : 7, + (6,'$') : 8, (7,'') : 7, (7,'$'): 8, (8,' ') : 9, @@ -14,30 +34,58 @@ (9,'^') : 10, (10,'&') : 11, (11,'&') : 11, - (11,'') : 12, - (11,'') : 13, - (11,'') : 14, + (11,'') : 12, #if noun, there should be one or more add'l tags but no following words + (11,'') : 13, #if adj, add'l tags are optional and should be followed by an n + (11,'') : 14, (11,'') : 15, - (12,'') : 16, - (13,'') : 16, - (14,'') : 16, - (15,'') : 16, - (16,'') : 16, - (16,'$') : 17, - (17,' ') : 18, + # (12,'') : 16, #### case: n* + (12,'') : 200, + (200,'') : 201, + (200,'$') : 17, + (201,'') : 201, + (201,'$') : 17, + # (13,'') : 13, #### case: adj(*) n* + (13,'') : 225, + (13,'$') : 250, + (225,'') : 225, + (225,'$') : 250, #followed by noun + (250,' '):251, + (251,'^'):252, + (252,'&'):253, + (253,'&'):253, + (253,''):12, + (253,''):13, + # (14,'') : 16, + (14,'') : 275, + (275,'') : 276, + (275,'$') : 250, + (276,'') : 276, + (276,'$') : 250, + # (15,'') : 16, #prn.pers same as n + (15,'') : 200, + (16,'') : 100, + (16, '$') : 17, + (100,'') : 100, + (100,'$') : 17, + (17,' ') : 18, #do not go to state 17 unless you are expecting 'out' to be the next word (18,'^') : 19, #? - (19,'&') : 11, + # (19,'&') : 11, (19,'o') : 20, (20,'u') : 21, (21,'t') : 22, (22,'') : 23, - (23,'') : 24, + (22,'') : 24, (23,'$') : 25, (24,'$') : 25, + (25,'') : 26, + (25,' ') : 26, (25,'\n') : 26 } +# is required +# is optional states = { + -1 : '', 0 : '^', 1 : 't', 2 : 'a', @@ -44,17 +92,27 @@ 3 : 'k', 4 : 'e', 5 : '', - 6 : '', # - 7 : '', #the second one + 6 : '', #secondary tag is necessary + 7 : '', #third, fourth, fifth...tags are optional 8 : '$', 9 : ' ', 10 : '^', - 11 : '&', #'ANY_CHAR', # - 12 : '', + 11 : '&', #represents any character 'ANY_CHAR + 12 : '', 13 : '', - 14 : '', + 14 : '', 15 : '', - 16 : '', + 16 : '', + 100: '', + 200: '', + 201: '', + 225: '', + 250: '$', + 251: ' ', + 252: '^', + 253: '&', + 275: '', + 276: '', 17 : '$', 18 : ' ', 19 : '^', @@ -64,22 +122,19 @@ 23 : '', 24 : '', 25 : '$', - 26 : '\n', + 26 : '\n' } -def next_token(first_tag_passed, in_lemma, in_take, in_out): - token = sys.stdin.read(1) - # print 'next_token' + token +def next_token(file, subsequent_tag, in_lemma, in_take, in_out): + token = file.read(1) if token == '<': #if in tag in_lemma = False c = '' while c != '>': - c = sys.stdin.read(1) + c = file.read(1) token += c - if first_tag_passed: + if subsequent_tag: token = '' - # first_tag_passed = True - # print in_lemma, in_take, in_out if in_lemma and not in_take and not in_out: # print in_lemma, in_take, in_out token = '&' #ANY_CHAR @@ -87,50 +142,79 @@ def step(state, token): #token is at the next state next_state = transitions.get((state,token)) + # if next_state == None: + # print('error: (current_state,token) pair not found in transitions. ' + str((state,token))) + # #acceptor: if not found in transitions, exit and do not reorder + # exit(1) + # elif next_state == 26: + # print ('successful parsing of line') # print('successful termination') + # exit(0) + # print states[next_state] + str(next_state) + output_token = states.get(next_state) + return next_state, output_token #return the next state, or None if it doesn't exist - if next_state == None: - # print(str(state), str(current_state), str(token)) - print('error: (current_state,token) pair not found in transitions. ' + str(current_state) + str(token)) - exit(1) - elif next_state == 25: - print('successful termination') - exit(0) +def main(): + f = open(sys.argv[1]) + # print('input a string:') + # eol = True + line_number = 0 + accepted = True + while True: #while eol: + # eol = False + line = '' + if accepted: + line_number += 1 + current_state = -1 - print states[next_state] - # print('inside step(): printing ' + states[next_state] + ' current state ' + str(next_state)) #prints the prev token - return next_state #transitions.get((state,token)) #return the next state, or None if it doesn't exist - -def main(): - print('input a string:') - current_state = 0 - first_tag_passed = False + subsequent_tag = False in_lemma = False in_take = False in_out = False - # token = next_token(first_tag_passed, in_lemma, in_take, in_out) - while states.get(current_state) != None: - token = next_token(first_tag_passed, in_lemma, in_take, in_out) - # print 'before step(): token = ' + token + ' current_state = ' + str(current_state) - next_state = step(current_state, token) + while states.get(current_state) != None and current_state != 26: + token = next_token(f, subsequent_tag, in_lemma, in_take, in_out) + if current_state == -1 and token == '': + print('successfully reached end of file') + exit(0) + elif current_state == -1 and token == '\n': + accepted = True + break + elif not accepted and token == '\n': + accepted = True + next_state, output_token = step(current_state, token) + if output_token == None: + break - first_tag_passed = next_state in [6, 7, 16, 12, 13, 14, 15, 16] #out not included - in_lemma = next_state in [1, 2, 3, 4, 10, 11, 20, 21, 22] #take and out don't need to be included? + line += output_token + + subsequent_tag = next_state in [5, 6, 7, 12, 13, 14, 15, 16, 100, 200, 201, 225, 275, 276] #every state that is a tag. secondary tags for 'out' not included because it only ever has one tag + in_lemma = next_state in [1, 2, 3, 10, 11, 252, 253, 19, 20, 21, 22] #include 4? do not include 22? in_take = next_state in [1, 2, 3, 4] - # in_out = sys.stdin.read(4) == 'out<' - in_out = next_state in [19, 20, 21] #-21 #should be: if peek(sys.stdin.read(3) == 'out<' + # print 'position: ' + str(f.tell()) + if next_state == 19: + #in c: there is an istream::peek() function + pos = f.tell() #store the current buffer position + peek = f.read(4) #read in the next 4 chars + f.seek(pos) #go back to the original position + if peek == 'out<': + in_out = True + # print 'subsequent_tag: ' + str(subsequent_tag) + ' in_lemma: ' + str(in_lemma) + ' in_take: ' + str(in_take) + ' in_out: ' + str(in_out) + # print '' + #TODO: when transitions are finalized, check indices - current_state = next_state - # print first_tag_passed, in_lemma, in_take, in_out - # print 'token' + token - # print str(current_state), str(states.get(current_state)) - print('error: current_state ' + state + ' not found in states') - exit(0) + current_state = next_state #can't set this earlier, or else the following print statement doesn't work + if current_state == 26: + print str(line_number) + ' ' + line + accepted = True + else: + # print('error: current_state ' + str(current_state) + ' not found in states') + # exit(1) + if accepted: + print str(line_number) + ' string not accepted \n' + accepted = False + current_state = -1 + line_number += 1 + # eol = True -#^take$ ^ccccc$ ^cccc$ ^out$ -#^take$ ^the$ ^thing$ ^out$ -#^take$ ^thing$ ^out$ -#^take$ ^thing$ ^out$ - if __name__ == '__main__': main() \ No newline at end of file