Commit f2a92000 authored by everet

Tidy up code and docs. Move TODO and CHANGELOG to dedicated files.

parent 1bedd6ef
.directory
notes.md
# Byte-compiled / optimized / DLL files
__pycache__/
......
# NEWS
2020.3.6.9002
* Scraper now removes duplicate ads and avoids unnecessary requests. `post_id`
is now the primary key in `CLSearch.data`.
2020.3.6.1
* Scraper now gets ads' post IDs from ad URLs. Before, a deep scrape was
required to get post IDs.
* Data columns are rearranged so `post_id` and `datetime_scr` appear first.
* `datetime_scr` now contains seconds, so it will differ across pages if
`deep=False` or across ads if `deep=True`.
2020.2.23.1
* First release.
......@@ -22,7 +22,7 @@ pip install git+https://gitlab.com/everetr/craigapts.git
from craigapts import CLSearch
GEO = "newjersey"
QUERY = "'no section 8'"
QUERY = '"no section 8"' # exact phrases must be within "double quotes"
# get basic data available on search result pages
c1 = CLSearch(GEO, QUERY)
......@@ -33,29 +33,10 @@ c2 = CLSearch(GEO, QUERY, deep=True)
print(c2.data)
```
## Changelog
## CHANGELOG
2020.3.6.9001
* Scraper now removes duplicate ads and avoids unnecessary requests. `post_id`
is now the primary key in `CLSearch.data`.
2020.3.6.1
* Scraper now gets ads' post IDs from ad URLs. Before, a deep scrape was
required to get post IDs.
* Data columns are rearranged so `post_id` and `datetime_scr` appear first.
* `datetime_scr` now contains seconds, so it will differ across pages if
`deep=False` or across ads if `deep=True`.
2020.2.23.1
* First release.
See [NEWS.md](NEWS.md).
## TODO
* Replace `requests` dependency with `urllib3`? Because minimalism.
* Let user specify which variables, how many pages, and how many ads to scrape
* CLI
See [TODO.md](TODO.md).
# TODO
* Single quotes don't work in CL searches; double quotes only. Is there a way
to clean input so single quotes are treated as double quotes (see the sketch
after this list)? Or perhaps add an `exact_phrase` boolean option?
* Let user specify ALL possible CL search options: Price range, ZIP code, beds,
baths, etc. Don't forget: Availability and Open House (sale_date) dates.
* Let user specify which variables, how many pages, and how many ads to scrape.
* Decide: Should timestamps reflect when each ad was scraped, or when the search
began? The former differs across *pages* when `deep=False` and differs across
*ads* when `deep=True`. But `post_id` already uniquely IDs ads and I don't need
an identifier for pages. If I change timestamps to reflect search begin time,
they'll differ across search objects. Is that desirable?
* Replace `requests` dependency with `urllib3`? Because minimalism.
* CLI
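
A rough sketch of the quote-cleaning idea (hypothetical `normalize_query` helper,
not part of craigapts; assumes phrases are never nested inside other quotes):

```python
import re

def normalize_query(query: str) -> str:
    """Rewrite 'single-quoted' phrases as "double-quoted" ones."""
    # CL only treats double-quoted phrases as exact, so swap quote style.
    return re.sub(r"'([^']*)'", r'"\1"', query)

print(normalize_query("'no section 8'"))  # '"no section 8"'
```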
......@@ -11,7 +11,7 @@ from random import uniform
from re import findall
from requests import get
from requests.exceptions import RequestException
from sys import exit
import sys
from time import sleep
......@@ -33,7 +33,8 @@ class CLSearch:
available geographies and their Craigslist aliases here:
<link>
`query`: str, default ""
Words for text search. Put exact phrases in quotes.
Words for text search. Exact phrases must be surrounded by
"double" quotes. 'Single' quotes will not work.
`deep`: bool, default False
Navigate to each individual ad and scrape the following additional
variables: Address, bathroom count, attributes (pet-friendly, etc),
......@@ -44,7 +45,7 @@ class CLSearch:
memory footprint.
Example:
c1 = CLSearch(geo="sfbay", query="'no section 8'")
c1 = CLSearch(geo="newjersey", query='"no section 8"')
print(c1.data)
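c2 = CLSearch(geo="newjersey", query='"no section 8"', deep=True)
print(c2.data)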
"""
......@@ -63,12 +64,25 @@ class CLSearch:
self.__post_ids = set()
self.data = []
self.next_page_url = self.__build_url(
f"/search/apa?query={query}&bundleDuplicates=1")
# FIXME
# Allow user to specify availability & open house dates. For now
# they're hard-coded to include all dates.
# FIXME
# Really should allow user to specify all CL search options:
# Price range, ZIP code, beds, baths, etc.
("/search/apa{}&bundleDuplicates=1&availabilityMode=0"
"&sale_date=all+dates").format(
# NOTE: "?query" _must_ come after "apa" & begin w/ "?"
f"?query={query}" if query else ""
)
)
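# e.g. geo="newjersey", query='"no section 8"' yields:
#   https://newjersey.craigslist.org/search/apa?query="no section 8"&bundleDuplicates=1&availabilityMode=0&sale_date=all+dates
# (requests percent-encodes the quotes and spaces when fetching)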
# start scraper
self.__scrape_all_pages()
# scraping methods
########################
### Scraping Methods ###
########################
def __scrape_all_pages(self):
"""Drive the scraper.
......@@ -80,7 +94,7 @@ class CLSearch:
self.__goto_next_page()
self.__scrape_page()
n += 1
print(f"Finished scraping {n} pages of results.")
print(f"Finished scraping {n} pages of results.\n")
self.__clean_data()
def __scrape_page(self):
......@@ -192,7 +206,9 @@ class CLSearch:
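# replace empty strings with None; fall back to [None] so callers
# always get a non-empty list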
info = [None if i == "" else i for i in info]
return info or [None]
# navigation methods
##########################
### Navigation Methods ###
##########################
def __goto_next_page(self):
"""Go to next results page."""
......@@ -209,7 +225,7 @@ class CLSearch:
try:
r = get(self.url)
except RequestException as e:
exit(f"Error when requesting {self.url} : {str(e)}")
sys.exit(f"Error when requesting {self.url} : {str(e)}")
else:
print(f"Parsing {self.url}\n")
self.reqc = r.content
......@@ -220,7 +236,9 @@ class CLSearch:
"""Build URL for search results from template."""
return f"https://{self.geo}.craigslist.org{suffix}"
# data methods
#####################
### Other Methods ###
#####################
def __clean_data(self):
"""Combine and clean scraped data."""
......
......@@ -18,7 +18,7 @@ URL = 'https://gitlab.com/everetr/craigapts'
EMAIL = ''
AUTHOR = 'Everet Rummel'
REQUIRES_PYTHON = '>=3.7.0'
VERSION = '2020.3.6.9001'
VERSION = '2020.3.6.9002'
# What packages are required for this module to be executed?
REQUIRED = [
......
......@@ -6,6 +6,6 @@ sys.path.append("/PATH/TO/craigapts/craigapts")
import pandas as pd
from search import CLSearch
nnj_ns8 = CLSearch(geo="newjersey", query='"no section 8"')
nnj_ns8 = CLSearch(geo="newjersey", query='"bedroom"')
df = nnj_ns8.data
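# expect False: post_id should be unique now that duplicate ads are dropped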
any(df.duplicated("post_id"))