# test_utils.py
import pathlib
import zipfile

import lxml
import lxml.etree
import pytest

from .utils import (
    characterize_cite,
    get_citation_style_info,
    read_article,
)

# Project root: the directory one level above the folder holding this file.
directory = pathlib.Path(__file__).resolve().parents[1]

"""
Test articles in the format (PMCID, citation style, comment). `failing` is
like `articles` except that those articles are expected to fail given the
current citation parsing implementation.
"""
# Each entry is (PMCID, expected citation style, free-text comment).
# These entries parametrize `test_article`.
articles = [
    ('PMC4878377', 'author', 'conventional authors like Ratajczak et al., 2006'),
    ('PMC5555280', 'author', 'conventional authors but only 3 cites total'),
    ('PMC4758082', 'author', '12274 cites, 3112 references'),
    ('PMC4413533', 'number', '3679 cites, 2857 references'),
    ('PMC5731425', 'number', 'cites like <sup>1</sup>'),
    ('PMC3294532', 'number', 'cites like <italic>5</italic>'),
    ('PMC3051456', 'unknown', 'cites like ▶'),
    ('PMC2660461', 'unknown', '12 blank cites, 9 numbered'),
    ('PMC2528962', 'number', 'mostly numbers like [23], but a few like [especially primates]'),
    ('PMC4799205', 'number', 'mixed between numbers like (7) and 7'),
]
# Same (PMCID, expected citation style, comment) layout as `articles`.
# These entries parametrize `test_failing_article`, which is marked xfail.
failing = [
    ('PMC3906063', 'number', 'cites like [ref 42]'),
]


def extract_articles():
    """
    Extract testing articles to download/pmc-test-articles.

    Every PMCID listed in `articles` and `failing` is read from
    download/pmc-articles-xml.zip with `read_article` and written,
    pretty-printed, to download/pmc-test-articles/{PMCID}.xml. Both paths
    are resolved relative to `directory`. The output directory is created
    when it does not exist yet.
    """
    zip_path = directory.joinpath('download/pmc-articles-xml.zip')
    output_dir = directory.joinpath('download/pmc-test-articles')
    with zipfile.ZipFile(zip_path) as zip_file:
        # write_bytes does not create missing parent directories, so make
        # sure the target folder exists before writing the first article.
        output_dir.mkdir(parents=True, exist_ok=True)
        for pmc, _style, _comment in articles + failing:
            root = read_article(zip_file, f'{pmc}.nxml')
            pretty = lxml.etree.tostring(root, pretty_print=True)
            output_dir.joinpath(f'{pmc}.xml').write_bytes(pretty)


def read_xml_file(path):
    """
    Load the uncompressed XML file at `path` and return the result of
    parsing its bytes with lxml.
    """
    return lxml.etree.fromstring(pathlib.Path(path).read_bytes())


@pytest.mark.parametrize('pmc,style,comment', articles)
def test_article(pmc, style, comment):
    """
    Check that the `mode_style` reported by `get_citation_style_info` for
    the extracted article `pmc` equals the expected `style`.

    The test articles are extracted on demand when this article's XML file
    is not present yet. `comment` is not used by the test body.
    """
    xml_path = directory / f'download/pmc-test-articles/{pmc}.xml'
    if not xml_path.exists():
        extract_articles()
    style_info = get_citation_style_info(read_xml_file(xml_path))
    assert info_style(style_info) == style if False else style_info['mode_style'] == style


@pytest.mark.parametrize('pmc,style,comment', failing)
@pytest.mark.xfail
def test_failing_article(pmc, style, comment):
    """
    Run `test_article` on each entry of `failing`.

    These articles are expected to fail currently (hence the xfail
    marker), and are instances to fix in the future.
    """
    test_article(pmc, style, comment)


@pytest.mark.parametrize('text,n_references, style', [
    ('1', 10, 'number'),
    ('[1]', 10, 'number'),
    ('2–4', 10, 'number'),
    ('(2–4)', 10, 'number'),
    ('1-3,7,9', 10, 'number'),
    ('1-3,[7,9]', 10, 'number'),
    ('[1]-[3],[7],[9]', 10, 'number'),
    ('<sup>1<sup>', 10, 'number'),
    ('[<italics>1<italics>]', 10, 'number'),
    ('Rogers, 2015', 10, 'author'),
    ('Zadikoff et al., 2007; Wu and Thijs, 2015; Hilton et al., 2004; Placidi et al., 2000; Jahromi et al., 2011', 10, 'author'),
    ('Zadikoff et al., 2007; Wu and Thijs, 2015; Hilton et al., 2004; Placidi et al., 2000; Jahromi et al., 2011', 3000, 'author'),
    ('Nakata et al. (1994)', 10, 'author'),
    ('Rogers et al., 2015', 10, 'author'),
    ('Rodgers, 2015', 3000, 'author'),
    ('Rogers et al.', 10, 'author'),
    ('2015', 10, 'author'),
    # NOTE(review): presumably a bare number larger than n_references is
    # not taken as a reference index, hence 'author' -- confirm in utils.
    ('1500', 1499, 'author'),
    ('1500', 1500, 'number'),
    ('1500', 1501, 'number'),
    ('', 10, 'unknown'),
    ('<sup></sup>', 10, 'unknown'),
    ('[<sup></sup>]', 10, 'unknown'),
    ('[]', 10, 'unknown'),
    ('S10', 10, 'number'),
    ('▶', 10, 'unknown'),
    ('a', 10, 'unknown'),
    ('z', 10, 'unknown'),
    ('zi', 10, 'author'),
])
def test_characterize_cite(text, n_references, style):
    """
    Check that `characterize_cite(text, n_references)` returns the expected
    `style` for a single citation text.
    """
    observed = characterize_cite(text, n_references)
    assert observed == style