mirror of
https://github.com/LibreOffice/dictionaries.git
synced 2025-08-20 14:12:37 +00:00
dictionary-to-thesaurus.py: Move blacklist to a separate file.
Change-Id: Ie05e0c0ce8b4f9541a5a143ddf9ccf960940a3b7
This commit is contained in:
9
cs_CZ/thesaurus/blacklist.txt
Normal file
9
cs_CZ/thesaurus/blacklist.txt
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# Terms that are in the dictionary, but should be left out from thesaurus creation
|
||||||
|
#
|
||||||
|
# The words here are English Czech pairs, delimited by a TAB. When one of
|
||||||
|
# them is missing (is empty), it means "any". Empty lines are ignored
|
||||||
|
|
||||||
|
?
|
||||||
|
(by the way)
|
||||||
|
(po)štvat
|
||||||
|
14. písmeno hebrejské abecedy
|
@ -18,20 +18,13 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
# add here the Czech words we want to leave out from the thesaurus generation
|
|
||||||
# (misbehaving, mistranslated, etc.)
|
|
||||||
ignore_words = [
|
|
||||||
'?',
|
|
||||||
'(by the way)',
|
|
||||||
'(po)štvat',
|
|
||||||
'14. písmeno hebrejské abecedy',
|
|
||||||
]
|
|
||||||
|
|
||||||
def usage():
|
def usage():
|
||||||
message = """Usage: {program} slovnik_data_utf8.txt
|
message = """Usage: {program} slovnik_data_utf8.txt blacklist.txt
|
||||||
|
|
||||||
slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
|
slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php
|
||||||
print(message.format(program = os.path.basename(sys.argv[0])))
|
blacklist.txt: List of words that should be ignored when generating the thesaurus
|
||||||
|
"""
|
||||||
|
sys.stderr.write(message.format(program = os.path.basename(sys.argv[0])))
|
||||||
|
|
||||||
def classify(typ):
|
def classify(typ):
|
||||||
if typ == '':
|
if typ == '':
|
||||||
@ -47,7 +40,18 @@ def classify(typ):
|
|||||||
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def parse(filename):
|
def parse(filename, blacklistname):
|
||||||
|
blacklist = {}
|
||||||
|
|
||||||
|
with open(blacklistname, "r") as fp:
|
||||||
|
for line in fp:
|
||||||
|
if (line == ''):
|
||||||
|
continue
|
||||||
|
elif (line[0] == '#'):
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
blacklist[line.strip(' \n')] = 1
|
||||||
|
|
||||||
synonyms = {}
|
synonyms = {}
|
||||||
meanings = {}
|
meanings = {}
|
||||||
|
|
||||||
@ -73,7 +77,13 @@ def parse(filename):
|
|||||||
if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
|
if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
|
||||||
word = word.strip('" ')
|
word = word.strip('" ')
|
||||||
|
|
||||||
if (word == '' or word in ignore_words):
|
if (word == ''):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if (index + '\t' + word in blacklist or
|
||||||
|
index in blacklist or
|
||||||
|
index + '\t' in blacklist or
|
||||||
|
'\t' + word in blacklist):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
typ = ''
|
typ = ''
|
||||||
@ -143,11 +153,11 @@ def buildThesaurus(synonyms, meanings):
|
|||||||
print line
|
print line
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
if (len(args) != 2):
|
if (len(args) != 3):
|
||||||
usage()
|
usage()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
(synonyms, meanings) = parse(args[1])
|
(synonyms, meanings) = parse(args[1], args[2])
|
||||||
|
|
||||||
print "UTF-8"
|
print "UTF-8"
|
||||||
buildThesaurus(synonyms, meanings)
|
buildThesaurus(synonyms, meanings)
|
||||||
|
Reference in New Issue
Block a user