rank_terms.py 980 Bytes
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
#!/usr/bin/env python
#
# rank_terms.py - rank index terms by frequency

import sys
import xapian

from operator import itemgetter

def parse_terms(lines):
    """Return the non-blank terms found in *lines*.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are dropped, so a blank line in the terms file is not
    looked up as the bare prefix and reported under an empty name.
    """
    stripped = (line.strip() for line in lines)
    return [term for term in stripped if term]


def rank_terms(index, terms, prefix):
    """Return ``(term, frequency)`` pairs sorted by ascending frequency.

    *index* is any object exposing ``get_termfreq(term)``; every term is
    looked up as ``prefix + term``. A term listed more than once is
    reported only once.
    """
    frequencies = {}
    for term in terms:
        frequencies[term] = index.get_termfreq(prefix + term)
    return sorted(frequencies.items(), key=itemgetter(1))


def main(argv):
    """Run the command-line tool and return a process exit status.

    *argv* is the full argument vector, program name included.

    Returns 0 on success or when ``-h`` is given, 2 when the argument count
    is wrong, and 1 when the index or the terms file cannot be read.
    """
    if "-h" in argv or len(argv) != 4:
        print("\nUsage: rank_terms.py INDEX TERMS_FILE PREFIX\n")
        return 0 if "-h" in argv else 2

    index_path, terms_path, prefix = argv[1:4]

    # Stop at the first failure: continuing without an index (or without a
    # terms list) can only end in a NameError further down.
    try:
        index = xapian.Database(index_path)
    except xapian.Error:
        sys.stderr.write("Could not open xapian index at %s\n" % index_path)
        return 1

    # Only the file access is guarded here, so that index lookup errors are
    # not misreported as a problem with the terms file.
    try:
        with open(terms_path) as terms_file:
            terms = parse_terms(terms_file)
    except IOError:
        sys.stderr.write(
            "Could not extract terms list from %s\n" % terms_path)
        return 1

    for term, freq in rank_terms(index, terms, prefix):
        # Single-argument print() emits the same text on Python 2 and 3.
        print("%s %s" % (term, freq))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))