Commit 31211020 authored by Erik Hetzner

Allow for unicode quotes in words

parent f19de0d7
......@@ -168,5 +168,18 @@ class TestXciteParserp(unittest.TestCase):
CitationCluster([CitationInfo(citekey=item1,
locator="p. 30")]))
def test_parse_unicode_quotes(self):
    """A word wrapped in Unicode quote characters parses as a citation suffix.

    Every combination of an opening character from UNICODE_PUNCT_INITIAL
    and a closing character from UNICODE_PUNCT_FINAL is checked.  The
    check is exhaustive rather than a single random pair so that the
    result is reproducible: a character the parser does not accept fails
    on every run instead of only when it happens to be drawn.
    """
    UNICODE_PUNCT_FINAL = u'\u00BB\u2019\u201D\u203A\u2E03\u2E05\u2E0A\u2E0D\u2E1D\u2E21'
    UNICODE_PUNCT_INITIAL = u'\u00AB\u2018\u201B\u201C\u201F\u2039\u2E02\u2E04\u2E09\u2E0C\u2E1C\u2E20'
    item1 = self.mk_citekey()
    for initial in UNICODE_PUNCT_INITIAL:
        for final in UNICODE_PUNCT_FINAL:
            suffix = "%sfoo%s" % (initial, final)
            # Name the offending pair in the failure output; without it a
            # failure inside the loop would not say which characters broke.
            pair = "quote pair %r / %r" % (initial, final)
            [first_cluster, second_cluster] = self.parse(
                "[@%s %s]" % (item1, suffix))
            self.assertEqual(first_cluster, None, pair)
            self.assertEqual(second_cluster,
                             CitationCluster([CitationInfo(citekey=item1,
                                                           suffix=suffix)]),
                             pair)
# Run the unittest command-line runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()
......@@ -81,7 +81,9 @@ class CiteParser(object):
return cites
def parse(self, what):
WORD_CHAR_RE = r'[\w.,\'\"\(\)</>-]'
UNICODE_PUNCT_FINAL=ur'\u00BB\u2019\u201D\u203A\u2E03\u2E05\u2E0A\u2E0D\u2E1D\u2E21'
UNICODE_PUNCT_INITIAL=ur'\u00AB\u2018\u201B\u201C\u201F\u2039\u2E02\u2E04\u2E09\u2E0C\u2E1CU+2E20'
WORD_CHAR_RE = r'[\w.,\'\"\(\)</>%s%s-]'%(UNICODE_PUNCT_INITIAL, UNICODE_PUNCT_FINAL)
CITEKEY_RE = r'\w[\w\(:.#\$%&+?<>~/\)-]+'
greedyToken = Regex(r'%s+'%(WORD_CHAR_RE))
wordWithDigits = Regex(r'%s*[0-9]%s*'%(WORD_CHAR_RE, WORD_CHAR_RE))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment