Commit e4243a0b authored by Francesco Montanari

Launch one query for all entries

* arxiv2bib/arxiv2bib.py: Remove the for loop over all arxiv
  numbers. Instead, merge them into a single query to inspire. This
  has two advantages. First, there is less danger of overloading the
  inspire servers. Second, bibtex retrieval is much faster.

* README: Update according to the change above.

* setup.py: Require `future` module to be installed. It has been
  introduced for compatibility with both Python 2 and 3.
parent 709aaea6
......@@ -17,12 +17,12 @@ General Public License for more details.
Parse [[https://arxiv.org/][arXiv]] numbers from a text file. Fetch the respective bibtex
entries from [[https://inspirehep.net/][inspire]] and print them to standard output.
/Note:/ Queries are launched sequentially to avoid inadvertently
launching a DoS (Denial of Service) attack. Do not fetch several
entries in a short amount of time.
* Installation
This program works with both Python 2 and 3. The =future= module
must be installed for Python 2. If the entries contain Unicode
characters, Python 3 is recommended.
Clone the git repository and type:
#+BEGIN_SRC shell
......@@ -31,8 +31,6 @@ General Public License for more details.
python3 setup.py install --user
#+END_SRC
This program works with both Python 2 and 3. If the entries
contain Unicode characters, Python 3 is recommended.
* Usage
......@@ -44,6 +42,9 @@ General Public License for more details.
arxiv2bib FILE
#+END_SRC
/Note:/ Do not launch the program several times in parallel, to avoid
inadvertently launching a DoS (Denial of Service) attack.
** Example
Let's assume that =refs.tex= contains a list of =bibitem= entries:
......
......@@ -23,6 +23,7 @@ arxiv numbers. All the results are printed to stdout.
"""
from builtins import input
import re
import sys
......@@ -34,7 +35,21 @@ def get_arxiv_ids(string, prefix='arxiv:'):
patterns such as `arXiv:1234.5678` or `arXiv:gr-qc/1234567`.
"""
regexp = r'[A-Za-z0-9.\-\/]*'
return re.findall(prefix+regexp, string, re.IGNORECASE)
arxiv_ids = re.findall(prefix+regexp, string, re.IGNORECASE)
# Use slices to remove the prefix (it could appear in different
# case combinations).
return [arxiv_id[len(prefix):] for arxiv_id in arxiv_ids]
def _warn_size(size, threshold=1):
"""Warn if list length is above the threshold. Ask to exit."""
if size > threshold:
print("Large reference list ({} items).".format(size))
ans = input('Continue? [n]/y: ')
assert isinstance(ans, str) # Native str on Py2 and Py3.
if (not ans.startswith('y')) and (not ans.startswith('Y')):
exit()
def get_bibtex(myfile):
......@@ -44,19 +59,22 @@ def get_bibtex(myfile):
"""
with open(myfile, 'r') as f:
string = f.read()
#string = string.encode('utf-8')
prefix = 'arxiv:'
arxiv_ids = set(get_arxiv_ids(string, prefix=prefix)) # Remove duplicates
arxiv_ids = get_arxiv_ids(string, prefix=prefix)
arxiv_ids = set(arxiv_ids) # Remove duplicates
size = len(arxiv_ids)
_warn_size(size)
base = 'find eprint '
search = base + " or eprint ".join(sorted(arxiv_ids))
resultformat = 'bibtex'
tags = None
for arxiv in sorted(arxiv_ids):
# Remove the starting 'arxiv:'. Since the case may be mixed
# (e.g., arxiv, arXiv or ARXIV), use list slices.
result = get_text_from_inspire(search=base+arxiv[len(prefix):],
if size > 0:
result = get_text_from_inspire(search=search,
resultformat=resultformat,
ot=tags)
print(result)
......
......@@ -24,4 +24,5 @@ setup(name='arxiv2bib',
packages=find_packages(),
entry_points={'console_scripts':
['arxiv2bib = arxiv2bib.arxiv2bib:get_cli',]},
install_requires=['future']
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment