# Lesson Planalyzer
# Copyright (C) 2019  Sam Thursfield <sam@afuera.me.uk>
#
# Lesson Planalyzer is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lesson Planalyzer is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Lesson Planalyzer.  If not, see <https://www.gnu.org/licenses/>.


"""
Helper functions for reading Microsoft Word .docx files.
"""


import xml.etree.ElementTree
import zipfile


# ElementTree spells a namespaced name as '{namespace-uri}localname', so each
# constant below is the namespace prefix joined to a local element or
# attribute name and can be passed directly to find()/iter() or used as an
# attribute key.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

# Paragraph element, its properties child, and the style reference inside it.
PARA = WORD_NAMESPACE + 'p'
PARA_PROPERTIES = WORD_NAMESPACE + 'pPr'
PARA_STYLE = WORD_NAMESPACE + 'pStyle'

# Child of the paragraph properties that marks a bulleted/numbered paragraph.
NUMBERED_PARAGRAPH = WORD_NAMESPACE + 'numPr'

# Text content and tab elements.
TEXT = WORD_NAMESPACE + 't'
TAB = WORD_NAMESPACE + 'tab'

# Table, table row and table cell elements.
TABLE = WORD_NAMESPACE + 'tbl'
ROW = WORD_NAMESPACE + 'tr'
CELL = WORD_NAMESPACE + 'tc'

# The namespaced 'val' attribute (read from the pStyle element).
VAL = WORD_NAMESPACE + 'val'


def load_docx(file):
    '''Parse word/document.xml out of a Microsoft Word .docx file.

    `file` is handed straight to zipfile.ZipFile, and the root element of
    the parsed XML is returned.

    NOTE: ZipFile is documented to accept "file-like" objects, but a file
    handle received from argparse has been seen to fail here with:

        zipfile.BadZipFile: File is not a zip file
    '''
    with zipfile.ZipFile(file) as archive:
        document_xml = archive.read('word/document.xml')
    return xml.etree.ElementTree.fromstring(document_xml)


def all_paragraphs(docx):
    '''Iterate over the paragraph elements of a parsed .docx file.

    Returns whatever docx.iter() yields when filtered on the paragraph tag.
    '''
    paragraphs = docx.iter(PARA)
    return paragraphs


def has_heading_style(paragraph):
    '''Returns True if the given paragraph has a style starting with 'Heading'.

    The style is read from the 'val' attribute of the pStyle element inside
    the paragraph's properties. A paragraph with no properties element, no
    pStyle element, or a pStyle element lacking that attribute is reported
    as not being a heading.
    '''
    properties = paragraph.find(PARA_PROPERTIES)
    if properties is None:
        return False
    style = properties.find(PARA_STYLE)
    if style is None:
        return False
    # Use get() rather than attrib[...] so that a pStyle element without a
    # 'val' attribute yields False instead of raising KeyError.
    return style.get(VAL, '').startswith('Heading')


def has_bullets_or_numbers(paragraph):
    '''Returns True if the given paragraph is part of a bullet or numbered list.

    A paragraph counts as a list item when its properties element contains
    a numPr child.
    '''
    properties = paragraph.find(PARA_PROPERTIES)
    if properties is None:
        return False
    # Compare against None explicitly: an element's truth value is not a
    # reliable indicator of whether find() located it.
    return properties.find(NUMBERED_PARAGRAPH) is not None


def get_text(paragraph):
    '''Returns the text of the given paragraph as a single string.

    Walks the paragraph element and everything nested inside it in document
    order, concatenating the content of each text element and emitting a
    tab character for each tab element. All other elements are ignored.
    '''
    pieces = []
    for element in paragraph.iter():
        if element.tag == TEXT:
            # An empty text element has .text of None, which str.join()
            # would reject with TypeError; treat it as an empty string.
            pieces.append(element.text or '')
        elif element.tag == TAB:
            pieces.append('\t')
    return ''.join(pieces)