dictionary-to-thesaurus.py: Move blacklist to a separate file.

Change-Id: Ie05e0c0ce8b4f9541a5a143ddf9ccf960940a3b7
2025-08-20 14:12:37 +00:00 · 2016-02-25 14:35:03 +01:00
parent bd5a09adea
commit f83b25d29f
2 changed files with 35 additions and 16 deletions
--- a/cs_CZ/thesaurus/blacklist.txt
+++ b/cs_CZ/thesaurus/blacklist.txt
@ -0,0 +1,9 @@
 # Terms that are in the dictionary, but should be left out from thesaurus creation
 #
 # Each entry is an English-Czech pair of words, delimited by a TAB.  When one
 # of them is missing (empty), it means "any".  Empty lines are ignored.
 	?
 	(by the way)
 	(po)štvat
 	14. písmeno hebrejské abecedy
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@ -18,20 +18,13 @@ import os
 import re
 import sys
 # add here the Czech words we want to leave out from the thesaurus generation
 # (misbehaving, mistranslated, etc.)
 ignore_words = [
    '?',
    '(by the way)',
    '(po)štvat',
    '14. písmeno hebrejské abecedy',
 ]
 def usage():
-    message = """Usage: {program} slovnik_data_utf8.txt
+    message = """Usage: {program} slovnik_data_utf8.txt blacklist.txt
-  slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
+  slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php
-    print(message.format(program = os.path.basename(sys.argv[0])))
+  blacklist.txt:         List of words that should be ignored when generating the thesaurus
 """
    sys.stderr.write(message.format(program = os.path.basename(sys.argv[0])))
 def classify(typ):
    if typ == '':
@ -47,7 +40,18 @@ def classify(typ):
    return ''
-def parse(filename):
+def parse(filename, blacklistname):
    blacklist = {}
    with open(blacklistname, "r") as fp:
        for line in fp:
            if (line == ''):
                continue
            elif (line[0] == '#'):
                continue
            else:
                blacklist[line.strip(' \n')] = 1
    synonyms = {}
    meanings = {}
@ -73,7 +77,13 @@ def parse(filename):
                if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
                    word = word.strip('" ')
-                if (word == '' or word in ignore_words):
+                if (word == ''):
                    continue
                if (index + '\t' + word in blacklist or
                        index in blacklist or
                        index + '\t' in blacklist or
                        '\t' + word in blacklist):
                    continue
                typ = ''
@ -143,11 +153,11 @@ def buildThesaurus(synonyms, meanings):
                print line
 def main(args):
-    if (len(args) != 2):
+    if (len(args) != 3):
        usage()
        sys.exit(1)
-    (synonyms, meanings) = parse(args[1])
+    (synonyms, meanings) = parse(args[1], args[2])
    print "UTF-8"
    buildThesaurus(synonyms, meanings)