WIP

parent 05a82060

jiten/freq.py
@@ -5,10 +5,10 @@
 #
 # File : jiten/freq.py
 # Maintainer : Felix C. Stegerman <flx@obfusk.net>
-# Date : 2020-06-19
+# Date : 2020-08-19
 #
 # Copyright : Copyright (C) 2020 Felix C. Stegerman
-# Version : v0.0.1
+# Version : v0.3.0
 # License : AGPLv3+
 #
 # -- ; }}}1
@@ -59,7 +59,7 @@ def parse_freq(file, word_first): # {{{1
       fields = line.split("\t")[:2]
       word, freq = fields if word_first else fields[::-1]
       freq = int(freq)
-      if word and (word in EXCEPTIONS or all(map(M.isokjap, word))):
+      if word and (word in EXCEPTIONS or M.isokjap(word)):
        assert word not in data or data[word] == freq
        data[word] = freq
  return data
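
The new call style above (M.isokjap(word) rather than all(map(M.isokjap, word))) relies on the string-level predicates this commit introduces in jiten/misc.py further down. A minimal standalone sketch of that lifting pattern, not part of the commit itself:

# per-character predicates (same code point ranges as jiten/misc.py) ...
iskanji1    = lambda c: 0x4e00 <= ord(c) <= 0x9fff
ishiragana1 = lambda c: 0x3040 <= ord(c) <= 0x309f

# ... lifted to whole-string predicates that check every character
def lift(char_pred):
  return lambda s: all(map(char_pred, s))

iskanji, ishiragana = lift(iskanji1), lift(ishiragana1)

assert iskanji("日本")           # every character is a kanji
assert not iskanji("ごはん")     # hiragana, not kanji
assert ishiragana("ごはん")
assert iskanji("")               # caveat: all() over an empty string is True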

jiten/jmdict.py
@@ -5,7 +5,7 @@
 #
 # File : jiten/jmdict.py
 # Maintainer : Felix C. Stegerman <flx@obfusk.net>
-# Date : 2020-08-16
+# Date : 2020-08-19
 #
 # Copyright : Copyright (C) 2020 Felix C. Stegerman
 # Version : v0.3.0
@@ -459,6 +459,7 @@ def load_entry(c, seq): # {{{1
   return Entry(seq, *( tuple(x) for x in [k, r, s] ))
 # }}}1
+# TODO
 def search(q, langs = [LANGS[0]], max_results = None, # {{{1
            noun = False, verb = False, prio = False,
            file = SQLITE_FILE):
@@ -483,7 +484,7 @@ def search(q, langs = [LANGS[0]], max_results = None, # {{{1
       ORDER BY prio DESC, rank ASC, seq ASC
       {}
     """.format(fltr, limit), (ord(q),)) # safe!
-  elif all( M.iscjk(c) for c in q ):
+  elif M.iscjk(q):
     query = ("""
       SELECT rank, seq FROM (
         SELECT entry FROM kanji WHERE elem LIKE :q
@@ -495,11 +496,9 @@ def search(q, langs = [LANGS[0]], max_results = None, # {{{1
       ORDER BY prio DESC, rank ASC, seq ASC
       {}
     """.format(fltr, limit), dict(q = "%"+q+"%")) # safe!
-  elif M.likeable(q):
+  elif M.q2like(q):
     load_pcre_extension(c.connection)
-    q2 = "".join( "_" if not M.iscjk(c) and not M.isascii(c)
-                      else c for c in M.without_e1w(q) )
-    prms = dict(q = "%"+q2+"%", re = M.q2rx(q))
+    prms = dict(q = M.q2like(q), re = M.q2rx(q))
     query = ("""
       SELECT rank, seq FROM (
         SELECT entry FROM kanji WHERE
@@ -507,7 +506,7 @@ def search(q, langs = [LANGS[0]], max_results = None, # {{{1
         UNION
         SELECT entry FROM reading WHERE
           elem LIKE :q AND elem REGEXP :re
-      """ + ("" if all( M.iscjk(c) for c in q2 ) else """
+      """ + ("" if M.iscjk(M.without_e1w(q)) else """
         UNION
         SELECT entry FROM sense WHERE
          lang IN ({}) AND gloss LIKE :q AND gloss REGEXP :re
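
The rewritten "elif M.q2like(q)" branch pairs a coarse LIKE prefilter (the pattern returned by q2like) with an exact REGEXP match (the pattern returned by q2rx, evaluated by the PCRE extension). A self-contained sketch of that two-step pattern; Python's re module stands in for the PCRE extension here, and the table and rows are made up for illustration:

import re, sqlite3

def regexp(pattern, value):
  # SQLite evaluates the REGEXP operator via a user function named "regexp"
  return re.search(pattern, value) is not None

conn = sqlite3.connect(":memory:")
conn.create_function("regexp", 2, regexp)
conn.executescript("""
  CREATE TABLE reading (entry INTEGER, elem TEXT);
  INSERT INTO reading VALUES (1, 'ねこ'), (2, 'ねこぜ'), (3, 'いぬ');
""")

q    = "+= ねこ"                                # the "exact" prefix command
prms = dict(q = "%ねこ%", re = r"(?im)^ねこ$")  # what q2like(q) / q2rx(q) yield
rows = conn.execute(
  "SELECT entry FROM reading WHERE elem LIKE :q AND elem REGEXP :re",
  prms).fetchall()
print(rows)                 # [(1,)] -- LIKE narrows the candidates, REGEXP decides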

jiten/kanji.py
@@ -5,10 +5,10 @@
 #
 # File : jiten/kanji.py
 # Maintainer : Felix C. Stegerman <flx@obfusk.net>
-# Date : 2020-08-08
+# Date : 2020-08-19
 #
 # Copyright : Copyright (C) 2020 Felix C. Stegerman
-# Version : v0.2.0
+# Version : v0.3.0
 # License : AGPLv3+
 #
 # -- ; }}}1
@@ -87,9 +87,9 @@ Entry(char='日', cat='KANJI', level='常用1', strokes=4, freq=1, jlpt=4, skip=
 >>> len(RADICALS)
 214
->>> all( M.iskanji(c) for c in KAN2RAD.keys() )
+>>> M.iskanji(KAN2RAD.keys())
 True
->>> all( M.isradical(c) for c in RAD2KAN.keys() )
+>>> M.isradical(RAD2KAN.keys())
 True
 >>> set( ord(c) - 0x2f00 for c in RAD2KAN.keys() ) == set(range(214))
 True
@@ -215,7 +215,7 @@ def parse_kanjidic(kanjivg = None, file = KANJIDIC_FILE): # {{{1
     assert 1 <= rad <= 214
     assert all( M.iskatakana(c) or c in ".-" for x in on for c in x )
     assert all( all( M.ishiragana(c) or c in ".-ー" for c in x ) or
-                all( M.iskatakana(c) for c in x ) for x in kun )
+                M.iskatakana(x) for x in kun )
     assert all( "\n" not in x for x in on )
     assert all( "\n" not in x for x in kun )
     assert all( "\n" not in x for x in nanori )
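
Since the lifted predicates are essentially all(map(pred1, s)), they accept any iterable of single-character strings, which is why the doctests above can now pass KAN2RAD.keys() and RAD2KAN.keys() directly. A tiny illustration with made-up stand-in data:

iskanji1 = lambda c: 0x4e00 <= ord(c) <= 0x9fff
iskanji  = lambda s: all(map(iskanji1, s))     # lifted form

toy_kan2rad = {"猫": 94, "犬": 94}             # made-up stand-in for KAN2RAD
print(iskanji(toy_kan2rad.keys()))             # True
print(iskanji("猫と犬"))                       # False ("と" is hiragana)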

jiten/misc.py
@@ -5,7 +5,7 @@
 #
 # File : jiten/misc.py
 # Maintainer : Felix C. Stegerman <flx@obfusk.net>
-# Date : 2020-08-16
+# Date : 2020-08-19
 #
 # Copyright : Copyright (C) 2020 Felix C. Stegerman
 # Version : v0.3.0
@@ -38,31 +38,45 @@ True
 >>> list(uniq([1, 2, 3, 1, 4, 2, 2]))
 [1, 2, 3, 4]
+>>> q2like(r"+w foo")
+'%foo%'
+>>> q2like(r".foo.*bar[a-z]baz")
+'%_foo%bar_baz%'
+>>> q2like(r"[^あいうえお]")
+'%_%'
+>>> q2like(r"猫\pK{2}\d\S")
+'%猫%__%'
 """ # }}}1
-import itertools, os, sys
+import itertools, re, os, sys
 class RegexError(RuntimeError): pass
 OKPUNC = "々"
-ispunc = lambda c: 0x3000 <= ord(c) <= 0x303f
-ishiragana = lambda c: 0x3040 <= ord(c) <= 0x309f
-iskatakana = lambda c: 0x30a0 <= ord(c) <= 0x30ff
+ispunc1 = lambda c: 0x3000 <= ord(c) <= 0x303f
+ishiragana1 = lambda c: 0x3040 <= ord(c) <= 0x309f
+iskatakana1 = lambda c: 0x30a0 <= ord(c) <= 0x30ff
-iskanji = lambda c: 0x4e00 <= ord(c) <= 0x9fff
-iscompat = lambda c: 0xf900 <= ord(c) <= 0xfaff
-isuniext = lambda c: 0x3400 <= ord(c) <= 0x4dbf or \
+iskanji1 = lambda c: 0x4e00 <= ord(c) <= 0x9fff
+iscompat1 = lambda c: 0xf900 <= ord(c) <= 0xfaff
+isuniext1 = lambda c: 0x3400 <= ord(c) <= 0x4dbf or \
             0x20000 <= ord(c) <= 0x2ebef
-isradical = lambda c: 0x2e80 <= ord(c) <= 0x2eff or \
+isradical1 = lambda c: 0x2e80 <= ord(c) <= 0x2eff or \
             0x2f00 <= ord(c) <= 0x2fdf
-iskana = lambda c: ishiragana(c) or iskatakana(c)
-isideo = lambda c: iskanji(c) or iscompat(c) or isuniext(c)
-isjap = lambda c: iskanji(c) or iskana(c) # TODO
-isokjap = lambda c: isjap(c) or c in OKPUNC # TODO
-iscjk = lambda c: isideo(c) or iskana(c) or ispunc(c) # TODO
+iskana1 = lambda c: ishiragana1(c) or iskatakana1(c)
+isideo1 = lambda c: iskanji1(c) or iscompat1(c) or isuniext1(c)
+isjap1 = lambda c: iskanji1(c) or iskana1(c) # TODO
+isokjap1 = lambda c: isjap1(c) or c in OKPUNC # TODO
+iscjk1 = lambda c: isideo1(c) or iskana1(c) or ispunc1(c) # TODO
+for _n, _f in list(locals().items()):
+  if _n.startswith("is") and _n.endswith("1"):
+    locals()[_n[:-1]] = (lambda f: lambda s: all(map(f, s)))(_f)
+del _n, _f
 isascii = getattr(str, "isascii",
   lambda s: all( ord(c) < 128 for c in s ))
@@ -83,23 +97,38 @@ def process_query(q, word, exact, fstwd):
   if not q: return ""
   q = q.strip()
   if word or exact or fstwd: q = without_e1w(q)
-  if q.startswith("+"): return q
-  if exact: return "+= " + q
-  if fstwd: return "+1 " + q
-  if word : return "+w " + q
+  if q.startswith("+~"): return q[2:].lstrip()
+  if not q.startswith("+"):
+    if exact: return "+= " + q
+    if fstwd: return "+1 " + q
+    if word : return "+w " + q
   return q
 def without_e1w(q):
   return q[2:].lstrip() if any( q.startswith("+"+x) for x in "=1w" ) else q
-def likeable(q):
-  return all( c not in ".^$*+?{}[]\\|()%_" for c in without_e1w(q) )
+LIKERX = re.compile("(" + "|".join([
+  r"\.", r"\[\^?\]?[^]]*\]", r"\\[dDsSwW]", r"\\p[khK]",
+  r"\\[pP]\{\w+\}"
+]) + r")(([+*]|\{\d+(,\d+)?\})?)|[^^$*+?{}\|()%_]")
+def q2like(q):
+  f = lambda c: "_" if not isascii(c) and c.upper() != c.lower() else c
+  q, p = without_e1w(q), ""
+  while q:
+    m = LIKERX.match(q)
+    if not m: return None
+    p += "%" if m[2] else "_" if m[1] else f(m[0])
+    q = q[m.end():]
+  return re.sub(r"%%+", "%", "%" + p + "%")
 def q2rx(q):
   if q.startswith("+="): q = "^" + q[2:].lstrip() + "$"
   elif q.startswith("+1"): q = "^" + q[2:].lstrip() + "\\b"
   elif q.startswith("+w"): q = "\\b" + q[2:].lstrip() + "\\b"
-  return "(?im)" + q
+  return "(?im)" + q.replace(r"\pk", r"\p{Katakana}") \
+                    .replace(r"\ph", r"\p{Hiragana}") \
+                    .replace(r"\pK", r"\p{Han}") # TODO
 if __name__ == "__main__":
   if "--doctest" in sys.argv:
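
The q2like doctests above suggest the intent: every regex construct is widened to a LIKE wildcard ("_" for exactly one character, "%" for any run), so any string the full regex matches is also matched by the LIKE pattern, and rows rejected by the LIKE prefilter can never be REGEXP hits. A quick standalone check of that property on the doctest examples, emulating SQL LIKE with Python's re:

import re

def like_match(pattern, s):
  # emulate SQL LIKE: '%' -> '.*', '_' -> '.', everything else literal
  rx = "".join(".*" if c == "%" else "." if c == "_" else re.escape(c)
               for c in pattern)
  return re.fullmatch(rx, s, re.S) is not None

cases = [                # (regex query, q2like() output, sample string)
  (r".foo.*bar[a-z]baz", "%_foo%bar_baz%", "xfooYYbarqbaz"),
  (r"[^あいうえお]",     "%_%",            "か"),
]
for rx, like, s in cases:
  assert re.search(rx, s)      # the regex matches the sample ...
  assert like_match(like, s)   # ... and so does the LIKE prefilter pattern
print("ok")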

jiten/sentences.py
@@ -5,10 +5,10 @@
 #
 # File : jiten/sentences.py
 # Maintainer : Felix C. Stegerman <flx@obfusk.net>
-# Date : 2020-08-12
+# Date : 2020-08-19
 #
 # Copyright : Copyright (C) 2020 Felix C. Stegerman
-# Version : v0.2.0
+# Version : v0.3.0
 # License : AGPLv3+
 #
 # -- ; }}}1
@@ -115,7 +115,7 @@ def search(q, langs = [], max_results = None, audio = False,
     for r in c.execute("SELECT * FROM entry WHERE id = ?", (id,)):
       yield Entry(*r) # #=1
   else:
-    sel = ["jap"] + ([] if all( M.iscjk(c) for c in q ) else LANGS)
+    sel = ["jap"] + ([] if M.iscjk(q) else LANGS)
     s = " OR ".join( x + " LIKE :q" for x in sel )
     for r in c.execute("""
       SELECT * FROM entry WHERE ({}) {} {} ORDER BY id {}
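
What the changed line amounts to: for an all-CJK query only the jap column is searched, otherwise the translation columns are included as well. A small illustration below; LANGS and the iscjk check are made-up stand-ins, not the actual jiten values:

LANGS = ["eng", "dut", "ger"]                                    # illustrative only
iscjk = lambda s: all( 0x3000 <= ord(c) <= 0x9fff for c in s )   # crude stand-in

for q in ["猫が好き", "cat"]:
  sel = ["jap"] + ([] if iscjk(q) else LANGS)
  s   = " OR ".join( x + " LIKE :q" for x in sel )
  print(q, "->", s)
# 猫が好き -> jap LIKE :q
# cat -> jap LIKE :q OR eng LIKE :q OR dut LIKE :q OR ger LIKE :q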

templates/_nav.html
@@ -2,10 +2,10 @@
 File : templates/_nav.html
 Maintainer : Felix C. Stegerman <flx@obfusk.net>
-Date : 2020-08-12
+Date : 2020-08-19
 Copyright : Copyright (C) 2020 Felix C. Stegerman
-Version : v0.2.0
+Version : v0.3.0
 License : AGPLv3+
 --> #}
@@ -193,6 +193,17 @@
     target="_blank" rel="noopener">PCRE Syntax</a>
   (what jiten actually uses).
   <hr/>
+  <h6>Prefix "Commands"</h6>
+  Queries support prefix "commands" unrelated to regex syntax:
+  e.g. <code>+w foo</code> (word) should give the same results
+  as <code>\bfoo\b</code>, <code>+1 foo</code> (1st word) as
+  <code>^foo\b</code>, and <code>+= foo</code> (exact) as
+  <code>^foo$</code>. Using these "commands" is often
+  significantly faster because they are handled separately,
+  which allows for certain optimisations. The <code>+</code>
+  prefix was chosen because no valid regex can start with a
+  <code>+</code>.
+  <hr/>
   <h6>Quick Reference</h6>
   <ul class="pl-4 mb-2"><li>
     Most letters and characters will simply match themselves;
@@ -274,25 +285,34 @@
     <code>\p{...}</code> matches a unicode property; e.g.
     <code>\p{Han}</code> matches kanji,
     <code>\p{Hiragana}</code> matches hiragana, and
-    <code>\p{Katakana}</code> matches katakana; <code>\P</code>
-    is its complement.
+    <code>\p{Katakana}</code> matches katakana;
+    <code>\P{...}</code> is its complement.
+  </li><li>
+    For easy matching of Japanese, jiten supports these
+    non-standard aliases: <code>\pK</code> for
+    <code>\p{Han}</code>, <code>\ph</code> for
+    <code>\p{Hiragana}</code>, and <code>\pk</code> for
+    <code>\p{Katakana}</code>.
   </li></ul>
   <hr/>
   <h6>Examples</h6>
   <ul class="pl-4 mb-2"><li>
-    <code>\bcat\b</code> matches "cat" in "the cat" (but not in
-    e.g. "indicates").
+    <code>+w cat</code> (<code>\bcat\b</code>) matches "cat" in
+    "the cat" (but not in e.g. "indicates").
   </li><li>
-    <code>^cat\b</code> matches "cat" in "cat" or "cat (esp. the
-    domestic cat, Felis catus)" (but not in e.g. "category").
+    <code>+1 cat</code> (<code>^cat\b</code>) matches "cat" in
+    "cat" or "cat (esp. the domestic cat, Felis catus)" (but not
+    in e.g. "category").
  </li><li>
-    <code>^cat$</code> matches "cat" exactly.
+    <code>+= cat</code> (<code>^cat$</code>) matches "cat"
+    exactly.
   </li></ul>
   <ul class="pl-4 mb-2"><li>
-    <code>^猫\p{Han}$</code> matches "猫" followed by exactly
-    one other kanji.
+    <code>+= 猫\pK</code> (<code>^猫\pK$</code>) matches "猫"
+    followed by exactly one other kanji.
   </li><li>
-    <code>^(\p{Han})\1$</code> matches e.g. "人人".
+    <code>+= (\pK)\1</code> (<code>^(\pK)\1$</code>) matches
+    e.g. "人人".
   </li></ul>
 </div>
 </div>
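
The help text's claim, restated executably: "+w cat" should behave like \bcat\b, "+1 cat" like ^cat\b, and "+= cat" like ^cat$ (mirroring q2rx in jiten/misc.py). Plain Python re is used here purely for illustration:

import re

texts = ["the cat", "indicates", "cat (esp. the domestic cat, Felis catus)",
         "category", "cat"]

for label, rx in [("+w cat", r"(?im)\bcat\b"),
                  ("+1 cat", r"(?im)^cat\b"),
                  ("+= cat", r"(?im)^cat$")]:
  hits = [t for t in texts if re.search(rx, t)]
  print(label, "->", hits)
# +w cat -> ['the cat', 'cat (esp. the domestic cat, Felis catus)', 'cat']
# +1 cat -> ['cat (esp. the domestic cat, Felis catus)', 'cat']
# += cat -> ['cat']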