mesh2cmdict.py 2.46 KB
#!/bin/python3

import os
import re
import sys
import tempfile
from urllib import request
from xml.sax import saxutils

from lxml import etree

import argparse

# Text shown at the top of the ``--help`` output.
_DESCRIPTION = """
Create a ContentMine compatible dictionary from MeSH data.
The dictionary can be filtered with MeSH Tree numbers.
You can navigate the MeSH Tree at:
<https://meshb.nlm.nih.gov/#/treeSearch>
"""

# The command line is parsed as soon as this module is loaded; the
# resulting ``args`` namespace carries ``output``, ``source`` and ``match``.
parser = argparse.ArgumentParser(description=_DESCRIPTION)
parser.add_argument(
    "output",
    help="write output to file, or stdout if an empty string",
)
parser.add_argument(
    "--source",
    help="name of file containing the MeSH definitions (otherwise fetched)",
)
parser.add_argument(
    "--match",
    default='',
    help="regular expression to match MeSH tree numbers",
)
args = parser.parse_args()


class CMDict():
    """In-memory ContentMine dictionary that can serialise itself to XML.

    Entries are kept in insertion order in ``self.terms`` as plain dicts
    with the keys ``term``, ``name``, ``cui`` and ``tui``.
    """

    # Besides &, < and > (handled by saxutils.escape itself) a double quote
    # must be escaped, because attribute values are written in double quotes.
    _ATTR_ENTITIES = {'"': '&quot;'}

    def __init__(self, title='mesh-2017-12'):
        """Create an empty dictionary.

        title -- value of the ``title`` attribute of the root element.
        """
        self.title = title
        self.terms = []

    def add(self, term, name, cui, tui):
        """Append one entry.

        term -- written to the ``term`` attribute
        name -- written to the ``name`` attribute
        cui  -- written to the ``nameid`` attribute
        tui  -- written to the ``termid`` attribute
        """
        self.terms.append(dict(term=term, name=name, cui=cui, tui=tui))

    @classmethod
    def _attr(cls, value):
        """Return ``value`` escaped for use inside a double-quoted attribute."""
        return saxutils.escape(str(value), cls._ATTR_ENTITIES)

    def to_xml(self):
        """Return the whole dictionary as a well-formed XML document string.

        The document is an XML declaration followed by a single
        ``<dictionary>`` root holding one ``<entry>`` per added term.
        """
        parts = [
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            '<dictionary title="{}">\n'.format(self._attr(self.title)),
        ]
        parts.extend(
            '  <entry term="{term}" name="{name}" nameid="{cui}" termid="{tui}" />\n'
            .format(**{key: self._attr(value) for key, value in entry.items()})
            for entry in self.terms
        )
        # The root element is the last thing in the document: the XML
        # declaration is not an element and has no closing tag.
        parts.append('</dictionary>\n')
        return ''.join(parts)

# Default download location of the gzip-compressed MeSH descriptor file.
MESH_URL = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/desc2017.gz'


def get_mesh(meshfile=None, url=MESH_URL):
    """Return the MeSH descriptor data as a parsed lxml element tree.

    meshfile -- path of a local MeSH XML file; when falsy the data is
                downloaded from ``url`` instead.
    url      -- location to download from when no ``meshfile`` is given.
    """
    if meshfile:
        return etree.parse(meshfile)

    with tempfile.NamedTemporaryFile() as spool:
        with request.urlopen(url) as response:
            spool.write(response.read())
        # The parser re-opens the file by name, so everything still sitting
        # in the write buffer has to reach the file first.
        spool.flush()
        # NOTE(review): the default download is gzip-compressed; this relies
        # on the parser reading gzip input transparently -- confirm for the
        # installed lxml/libxml2 build.
        return etree.parse(spool.name)

def convert_mesh(tree, cmdict, tree_re=''):
    """Copy the terms of a parsed MeSH descriptor tree into ``cmdict``.

    tree    -- parsed MeSH document whose root holds ``DescriptorRecord``
               children.
    cmdict  -- object with an ``add(term, name, cui, tui)`` method; it is
               called once per ``Term`` found under each ``Concept``.
    tree_re -- regular expression applied with ``match`` (anchored at the
               start) to each tree number of a descriptor; the descriptor
               is kept when at least one tree number matches. An empty
               value disables filtering, so descriptors without any tree
               number are kept as well.

    Raises IndexError when a Concept or Term lacks one of the expected
    child elements.
    """
    # Only build a filter when one was requested: matching an empty pattern
    # against an empty tree-number list would reject the descriptor.
    pattern = re.compile(tree_re) if tree_re else None

    for descriptor in tree.getroot().iterchildren('DescriptorRecord'):
        if pattern is not None:
            tree_numbers = descriptor.xpath('TreeNumberList/TreeNumber/text()')
            if not any(pattern.match(number) for number in tree_numbers):
                continue
        for concept in descriptor.iterdescendants('Concept'):
            cui = concept.xpath('ConceptUI/text()')[0]
            name = concept.xpath('ConceptName/String/text()')[0]
            for term_node in concept.iterdescendants('Term'):
                tui = term_node.xpath('TermUI/text()')[0]
                term = term_node.xpath('String/text()')[0]
                cmdict.add(term, name, cui, tui)
               

if __name__ == '__main__':
    # Load the MeSH data, convert the matching descriptors and emit the
    # resulting dictionary either to the requested file or to stdout.
    cmdict = CMDict()
    tree = get_mesh(args.source)
    convert_mesh(tree, cmdict, args.match)
    document = cmdict.to_xml()
    if args.output:
        # The XML declaration announces UTF-8, so the file is written as
        # UTF-8 regardless of the platform's default encoding.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(document)
    else:
        print(document)