Commit 7c51cf9a authored by Francesco Montanari

Initial commit

parents
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
.static_storage/
.media/
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
This diff is collapsed.
arxiv2bib: Parse arxiv numbers and download the respective bibtex entries
Copyright (C) 2017 Francesco Montanari
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
* Description
Parse [[https://arxiv.org/][arXiv]] numbers from a text file. Fetch the respective bibtex
entries from [[https://inspirehep.net/][inspire]] and print them to standard output.
/Note:/ Queries are launched sequentially. Avoid fetching many
entries in a short period of time, so as not to inadvertently launch
a DoS (Denial of Service) attack.
* Installation
This program works with both Python 2 and 3. If the entries
contain Unicode characters, Python 3 is recommended.
* Usage
Let's assume that the file =refs.tex= contains a list of arXiv
numbers. For instance, this could be a list of =bibitem= entries:
#+BEGIN_EXAMPLE
\bibitem{Aad:2012tfa}
G.~Aad {\it et al.} [ATLAS Collaboration],
%``Observation of a new particle in the search for the Standard Model Higgs boson with the ATLAS detector at the LHC,''
Phys.\ Lett.\ B {\bf 716} (2012) 1
doi:10.1016/j.physletb.2012.08.020
[arXiv:1207.7214 [hep-ex]].
\bibitem{Ade:2015xua}
P.~A.~R.~Ade {\it et al.} [Planck Collaboration],
%``Planck 2015 results. XIII. Cosmological parameters,''
Astron.\ Astrophys.\ {\bf 594} (2016) A13
doi:10.1051/0004-6361/201525830
[arXiv:1502.01589 [astro-ph.CO]].
#+END_EXAMPLE
(The format of the file is not relevant, only the arXiv numbers are
parsed. In this case =arXiv:1207.7214= and =arXiv:1502.01589=.)
Fetch the corresponding bibtex entries and print them to the terminal:
#+BEGIN_SRC shell
arxiv2bib refs.tex
#+END_SRC
* Acknowledgments
The program depends on =pyinspire=, distributed together with this
package (no need to install it separately).
* Development
Check out the repository. Run the code as:
#+BEGIN_SRC shell
python3 -m arxiv2bib.arxiv2bib *args*
#+END_SRC
arxiv2bib: Parse arxiv numbers and download the respective bibtex entry
Copyright (C) 2017 Francesco Montanari
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
* Description
Parse [[https://arxiv.org/][arXiv]] numbers from a text file. Fetch the respective bibtex
entries from [[https://inspirehep.net/][inspire]] and print them to standard output.
/Note:/ Queries are launched sequentially. Avoid fetching many
entries in a short period of time, so as not to inadvertently launch
a DoS (Denial of Service) attack.
* Installation
This program works with both Python 2 and 3. If the entries
contain Unicode characters, Python 3 is recommended.
* Usage
Let's assume that the file =refs.tex= contains a list of arXiv
numbers. For instance, this could be a list of =bibitem= entries:
#+BEGIN_EXAMPLE
\bibitem{Aad:2012tfa}
G.~Aad {\it et al.} [ATLAS Collaboration],
%``Observation of a new particle in the search for the Standard Model Higgs boson with the ATLAS detector at the LHC,''
Phys.\ Lett.\ B {\bf 716} (2012) 1
doi:10.1016/j.physletb.2012.08.020
[arXiv:1207.7214 [hep-ex]].
\bibitem{Ade:2015xua}
P.~A.~R.~Ade {\it et al.} [Planck Collaboration],
%``Planck 2015 results. XIII. Cosmological parameters,''
Astron.\ Astrophys.\ {\bf 594} (2016) A13
doi:10.1051/0004-6361/201525830
[arXiv:1502.01589 [astro-ph.CO]].
#+END_EXAMPLE
(The format of the file is not relevant, only the arXiv numbers are
parsed. In this case =arXiv:1207.7214= and =arXiv:1502.01589=.)
Fetch the corresponding bibtex entries and print them to the terminal:
#+BEGIN_SRC shell
arxiv2bib refs.tex
#+END_SRC
* Acknowledgments
The program depends on =pyinspire=, distributed together with this
package (no need to install it separately).
* Development
Check out the repository. Run the code as:
#+BEGIN_SRC shell
python3 -m arxiv2bib.arxiv2bib *args*
#+END_SRC
# Copyright (C) 2017 Francesco Montanari
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
"""arxiv2bib: Parse arxiv numbers and download the respective bibtex entries
Provide functions to parse arxiv numbers from text files. Query inspirehep.net
to print to stdout the bibtex entries corresponding to the respective
arxiv numbers. All the results are printed to stdout.
.. note::
This program works with both Python 2 and 3. If the input file
contains Unicode characters, Python 3 is recommended.
"""
import re
import sys
from .pyinspire.pyinspire import get_text_from_inspire
def get_arxiv_ids(string, prefix='arxiv:'):
    """Return every arXiv identifier found in `string`.

    Match patterns such as `arXiv:1234.5678` or `arXiv:gr-qc/1234567`.
    The prefix is matched case-insensitively and is kept in the
    returned strings exactly as it appears in the input.

    Parameters
    ----------
    string : str
        Text to scan.
    prefix : str
        Literal text that introduces an identifier.

    Returns
    -------
    list of str
        The matches in order of appearance, duplicates included.
    """
    # Escape the prefix so that it is always matched literally, and
    # require the identifier to end with an alphanumeric character so
    # that neither a bare prefix nor trailing punctuation (e.g. the
    # full stop closing a sentence) is captured as an identifier.
    pattern = re.escape(prefix) + r'[A-Za-z0-9.\-/]*[A-Za-z0-9]'
    return re.findall(pattern, string, re.IGNORECASE)
def get_bibtex(myfile):
    """Print the inspire query results for the arxiv numbers in `myfile`.

    The file is read in full and scanned with `get_arxiv_ids`.  One
    query is sent through `get_text_from_inspire` for each distinct
    arxiv number, in sorted order, and each result is printed to
    stdout.  A final summary line reports how many distinct numbers
    were queried.

    Parameters
    ----------
    myfile : str
        Path of the text file to scan.
    """
    with open(myfile, 'r') as f:
        string = f.read()

    prefix = 'arxiv:'
    # Strip the prefix *before* removing duplicates: the prefix is
    # matched case-insensitively, so `arXiv:1234.5678` and
    # `arxiv:1234.5678` would otherwise be queried (and counted) twice.
    # Since the case may be mixed (e.g., arxiv, arXiv or ARXIV), the
    # prefix is removed by slicing rather than by string replacement.
    arxiv_ids = set(match[len(prefix):]
                    for match in get_arxiv_ids(string, prefix=prefix))

    base = 'find eprint '
    resultformat = 'bibtex'
    tags = None

    for arxiv in sorted(arxiv_ids):
        result = get_text_from_inspire(search=base+arxiv,
                                       resultformat=resultformat,
                                       ot=tags)
        print(result)

    print("%%% arxiv2bib retrieved {} references.".format(len(arxiv_ids)))
def get_cli():
    """Command line interface.

    Pass the first command line argument to `get_bibtex`; when no
    argument was given, print a usage line to stdout instead.
    """
    arguments = sys.argv[1:]
    if not arguments:
        print("Usage: arxiv2bib FILE")
        return
    get_bibtex(arguments[0])
# Run the command line interface when the module is executed as a script.
if __name__ == '__main__':
    get_cli()
# Copyright (C) 2017 Francesco Montanari
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
"""arxiv2bib: Parse arxiv numbers and download the respective bibtex entries
Provide functions to parse arxiv numbers from text files. Query inspirehep.net
to print to stdout the bibtex entries corresponding to the respective
arxiv numbers. All the results are printed to stdout.
.. note::
This program works with both Python 2 and 3. If the input file
contains Unicode characters, Python 3 is recommended.
"""
import re
import sys
from .pyinspire.pyinspire import get_text_from_inspire
def get_arxiv_ids(string, prefix='arxiv:'):
    """Return every arXiv identifier found in `string`.

    Match patterns such as `arXiv:1234.5678` or `arXiv:gr-qc/1234567`.
    The prefix is matched case-insensitively and is kept in the
    returned strings exactly as it appears in the input.

    Parameters
    ----------
    string : str
        Text to scan.
    prefix : str
        Literal text that introduces an identifier.

    Returns
    -------
    list of str
        The matches in order of appearance, duplicates included.
    """
    # The prefix is escaped so that it is always matched literally.
    # The identifier must end with an alphanumeric character, which
    # keeps a bare prefix and trailing punctuation (such as the full
    # stop closing a sentence) out of the result.
    pattern = re.escape(prefix) + r'[A-Za-z0-9.\-/]*[A-Za-z0-9]'
    return re.findall(pattern, string, re.IGNORECASE)
def get_bibtex(myfile):
    """Print the inspire query results for the arxiv numbers in `myfile`.

    The file is read in full and scanned with `get_arxiv_ids`.  One
    query is sent through `get_text_from_inspire` for each distinct
    arxiv number, in sorted order, and each result is printed to
    stdout.  A final summary line reports how many distinct numbers
    were queried.

    Parameters
    ----------
    myfile : str
        Path of the text file to scan.
    """
    with open(myfile, 'r') as f:
        string = f.read()

    prefix = 'arxiv:'
    # Remove the prefix before deduplicating.  It is matched
    # case-insensitively, so keeping it would make `arXiv:1234.5678`
    # and `arxiv:1234.5678` count as two different references.  The
    # case may be mixed (e.g., arxiv, arXiv or ARXIV), hence the slice.
    arxiv_ids = set(match[len(prefix):]
                    for match in get_arxiv_ids(string, prefix=prefix))

    base = 'find eprint '
    resultformat = 'bibtex'
    tags = None

    for arxiv in sorted(arxiv_ids):
        result = get_text_from_inspire(search=base+arxiv,
                                       resultformat=resultformat,
                                       ot=tags)
        print(result)

    print("%%% arxiv2bib retrieved {} references.".format(len(arxiv_ids)))
# Script entry point.  `sys.argv` always holds at least the program
# name, so a FILE argument is present only when the list has two or
# more items; without it, print the usage line instead of failing on
# `sys.argv[1]`.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: arxiv2bib FILE")
    else:
        FILE = sys.argv[1]
        get_bibtex(FILE)
Copyright (c) 2012-2015, Ian Huston
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of pyinspire nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL IAN HUSTON
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
=========
Pyinspire
=========
Retrieve results from the INSPIRE HEP database (http://inspirehep.net) from the
command line.
Author: Ian Huston
Contributors: David Straub
Released under the modified BSD license.
Usage: pyinspire.py [options]
Example: pyinspire.py -b -s "find a Feynman, Richard"
Options:
--version show program's version number and exit
-h, --help show this help message and exit
-s STRING, --search=STRING search string to send to INSPIRE
-b, --bibtex output bibtex for entries
--latexEU use LaTeX(EU) format for entries
--latexUS use LaTeX(US) format for entries
-v, --verbose print informative messages
--debug log lots of debugging information
Tested with Python 2.7 and 3.4.
Please be careful to not overload the INSPIRE server by repeatedly requesting
large numbers of results. If using pyinspire in a script please add some
throttling of frequency of search queries.
Note: Currently only the first 100 results are returned.
#!/usr/bin/env python
'''
pyinspire - command line retrieval of INSPIRE HEP database results
Author: Ian Huston
Released under the modified BSD license.
Example: pyinspire.py -b -s "find a Feynman, Richard"
'''
import sys
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
from bs4 import BeautifulSoup
import optparse
import logging
import re
# Version string of this module.
__version__ = "0.3.0"
# Base of every search URL; `inspire_url` appends the url-encoded
# query options to it.
APIURL = "http://inspirehep.net/search?"
# Configure the root logger with the default settings at import time.
logging.basicConfig()
# Logger shared by the functions of this module.
log = logging.getLogger("pyinspire")
def get_text_from_inspire(search="", resultformat="brief", ot=None):
    """Run an INSPIRE search and return its result as text.

    The raw response obtained from `query_inspire` is decoded as UTF-8
    for the "marcxml" and "json" formats; for any other format it is
    handed to `extract_from_data`.
    """
    log.info("Search of INSPIRE started...")
    raw = query_inspire(search, resultformat=resultformat, ot=ot)
    if resultformat in ("marcxml", "json"):
        return raw.decode("utf-8")
    return extract_from_data(raw)
def inspire_url(search="", resultformat="brief", startrecord=0, ot=None):
    """Build the search URL to send to INSPIRE.

    `resultformat` must be one of "brief", "bibtex", "latexEU",
    "latexUS", "marcxml" or "json"; any other value raises KeyError.
    The `ot` option is only added for "marcxml" and "json", where it
    defaults to authors and title only.
    """
    format_codes = {
        "brief": "hb",
        "bibtex": "hx",
        "latexEU": "hlxe",
        "latexUS": "hlxu",
        "marcxml": "xm",
        "json": "recjson",
    }
    params = dict(
        action_search="Search",
        rg=100,  # number of results to return in one page
        of=format_codes[resultformat],  # format of results
        ln="en",  # language
        p=search,  # search string
        jrec=startrecord,  # record number to start at
    )
    # Default output tags: return authors & title only.
    if resultformat == "marcxml":
        params['ot'] = '100,700,245' if ot is None else ot
    elif resultformat == "json":
        params['ot'] = 'authors,title' if ot is None else ot
    return APIURL + urlencode(params)
def query_inspire(search="", resultformat="brief", ot=None):
    """Query the INSPIRE HEP database and return the entries.

    Parameters
    ----------
    search : string
        search string to use in query
    resultformat : string
        long hand name of format, one of the names accepted by
        `inspire_url`: ["brief", "bibtex", "latexEU", "latexUS",
        "marcxml", "json"]
    ot : string or None
        output tags, passed on to `inspire_url`

    Returns
    -------
    The data read from the response, as returned by its `read` method.

    Raises
    ------
    IOError
        If opening the URL or reading from it fails. The error is
        logged before it is re-raised.
    """
    url = inspire_url(search, resultformat, ot=ot)
    log.debug("Query URL is %s", str(url))
    try:
        response = urlopen(url)
        try:
            log.debug("Starting to read data from %s.", str(url))
            data = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        log.debug("Data has been read: \n %s", str(data))
    except IOError as e:
        log.error("Error retrieving results: %s", str(e))
        raise
    return data
def extract_from_data(data):
    """Parse `data` with BeautifulSoup (lxml parser) and return its text.

    When the parsed document contains a <pre> tag the result comes from
    `extract_pre_tags`, otherwise from `extract_text`.
    """
    soup = BeautifulSoup(data, "lxml")
    extractor = extract_pre_tags if soup.pre else extract_text
    return extractor(soup)
def extract_pre_tags(soup):
    """Return the text of every <pre> tag in `soup`, joined by newlines."""
    pre_texts = (pre.text for pre in soup.find_all("pre"))
    return "\n".join(pre_texts)
def citecount(text):
    """Return the number of citations announced in `text`.

    The count is read from the phrase 'Cited by X records'; the
    singular form 'Cited by 1 record' is accepted as well.  Return 0
    when no such phrase with at least one digit is present.
    """
    # `\d+` (not `\d*`) guarantees that the captured group is a valid
    # integer literal, so `int` cannot fail on an empty match.
    match = re.search(r"Cited by (\d+) records?", text)
    return int(match.group(1)) if match else 0
def extract_text(soup):
    """Extract useful text from BeautifulSoup soup.

    Every <div class="record_body"> is paired with the
    <div class="moreinfo"> at the same position.  The <ul> nested in
    the <small> element of a record body, if any, is removed, and the
    "Detailed record - " label is dropped from the extra information.
    Records are separated by a line of 40 "=" characters, and double
    newlines are collapsed into single ones.

    Returns the empty string when no record is found.

    Raises
    ------
    ValueError
        If the numbers of record bodies and of "moreinfo" blocks differ.
    """
    mainbodies = soup.find_all("div", {"class": "record_body"})
    moreinfos = soup.find_all("div", {"class": "moreinfo"})
    if len(mainbodies) != len(moreinfos):
        raise ValueError("Number of records is inconsistent.")
    if len(mainbodies) == 0:
        log.info("No useful information found in text.")
        return ""

    for body in mainbodies:
        # A record may have no <small> element at all; only look for
        # the nested <ul> when it does.
        small = body.small
        nested_list = small.ul if small is not None else None
        if nested_list:
            nested_list.replace_with("")

    moreinfotext = [mi.text.replace("Detailed record - ", "")
                    for mi in moreinfos]
    records = [mb.text + "\n" + mi
               for mb, mi in zip(mainbodies, moreinfotext)]
    separator = "\n" + 40*"=" + "\n"
    return separator.join(records).replace("\n\n", "\n")
def extract_details(soup):
    """Extract title, authors and citation count of every record.

    Every <div class="record_body"> is paired with the
    <div class="moreinfo"> at the same position.

    Returns
    -------
    list of dict
        One dict per record with the keys "title" (text of the first
        "titlelink" anchor), "authors" (list with the text of every
        "authorlink" anchor) and "citations" (result of `citecount` on
        the text of the "moreinfo" block).  The list is empty when no
        record is found.

    Raises
    ------
    ValueError
        If the numbers of record bodies and of "moreinfo" blocks differ.
    IndexError
        If a record body has no "titlelink" anchor.
    """
    mainbodies = soup.find_all("div", {"class": "record_body"})
    moreinfos = soup.find_all("div", {"class": "moreinfo"})
    if len(mainbodies) != len(moreinfos):
        raise ValueError("Number of records is inconsistent.")
    if len(mainbodies) == 0:
        log.info("No useful information found in text.")
        return []

    details = []
    for mb, mi in zip(mainbodies, moreinfos):
        # Use `find_all`, like the rest of this module, rather than the
        # legacy camelCase alias.
        title_links = mb.find_all("a", {"class": "titlelink"})
        author_links = mb.find_all("a", {"class": "authorlink"})
        details.append({
            "title": title_links[0].text,
            "authors": [link.text for link in author_links],
            "citations": citecount(mi.text),
        })
    return details
def main(argv=None):