chore: move divi folder from datasets repo

parent 007d9430
FROM python:3.8.2

# Install cron (used by the host-side crontab workflow) and clean the apt
# lists in the same layer so the cache files never persist in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends cron \
 && apt-get autoremove -y \
 && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application code so the
# (slow) pip layer stays cached when only the scraper source changes.
COPY ./requirements.txt /opt/requirements.txt
RUN pip3 install --no-cache-dir -r /opt/requirements.txt

# Application code and entrypoint script.
COPY ./scrape_divi_data.py /opt/scrape_divi_data.py
COPY ./run_divi_scraper.sh /opt/run_divi_scraper.sh
RUN chmod 0755 /opt/run_divi_scraper.sh

ENTRYPOINT [ "/opt/run_divi_scraper.sh" ]
\ No newline at end of file
# Run the scraper image every 20 minutes; --rm discards the container after each run.
# NOTE(review): no --network flag — the ad-hoc container joins the default bridge,
# not the compose network where 'mongo' resolves; confirm how this crontab is used.
*/20 * * * * docker run --rm divi-data-scraper:0.0.1
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
version: '3.1'

networks:
  # Private bridge network shared by the scraper and its database.
  app-divi:
    driver: bridge

services:
  mongo:
    # NOTE(review): image tag is unpinned ('mongo' == latest) — pin a version
    # for reproducible deployments.
    image: mongo
    ports:
      - 27017:27017
    environment:
      MONGO_INITDB_ROOT_USERNAME: root
      # NOTE(review): hardcoded credentials; the scraper connects with these
      # same values (see scrape_divi_data.py) — move to an env file / secret.
      MONGO_INITDB_ROOT_PASSWORD: example
    networks:
      - app-divi
    volumes:
      # Persist database files on the host.
      - ./data/mongo/divi:/data/db

  divi:
    image: 'divi-data-scraper:0.0.1'
    networks:
      - app-divi
    # Start the database before the scraper (start order only, not readiness).
    depends_on:
      - mongo
\ No newline at end of file
# Runtime dependencies for scrape_divi_data.py.
# NOTE(review): versions are unpinned — pin (e.g. pymongo==X.Y.Z) for
# reproducible image builds.
pymongo
beautifulsoup4
requests
\ No newline at end of file
#!/bin/sh
# Container entrypoint: run the scraper once.
# exec replaces the shell so python3 becomes PID 1 and receives SIGTERM
# directly from `docker stop` instead of the signal dying with the shell.
exec python3 /opt/scrape_divi_data.py
\ No newline at end of file
from bs4 import BeautifulSoup
import requests
import re
import json
import datetime
from datetime import datetime
from random import randint
from pymongo import MongoClient
import argparse
# Command-line options. Default behavior (no flags) writes the scraped data
# to MongoDB; --json-file dumps to a local JSON file instead, and
# --geo-features (only meaningful together with --json-file) emits GeoJSON.
ap = argparse.ArgumentParser()
ap.add_argument("-j", "--json-file", action='store_true', required=False,
                help="Whether the data should be saved as a json file or saved to mongo db")
ap.add_argument("-g", "--geo-features", action='store_true', required=False,
                help="Does nothing if --json-file is not active, transforms the json to geo feature data otherwise")
args = ap.parse_args()
def get_status_from_string(class_name):
    """Map a DIVI status CSS class name to a numeric availability code.

    The class name is split on '-' and scanned for a color/state token:
    1 = green, 2 = yellow, 3 = red, 0 = unavailable, 99 = unrecognized.
    """
    tokens = class_name.split("-")
    # Same precedence as the original if/elif chain: first match wins.
    for keyword, code in (("green", 1), ("yellow", 2), ("red", 3), ("unavailable", 0)):
        if keyword in tokens:
            return code
    return 99
# Log scraper start time (shows up in container logs).
print(datetime.now().strftime("%Y%m%d-%H%M%S"), ": Scraper started")

# Form parameters for the DIVI intensive-care register list view.
# 'list[limit]': '0' requests all rows in one page (no pagination).
form_param = {
    'filter[search]': None,
    'list[fullordering]': 'a.title ASC',
    'list[limit]': '0',
    'filter[federalstate]': '0',
    'filter[chronosort]': '0',
    'filter[icu_highcare_state]': None,
    'filter[ecmo_state]': None,
    'filter[ards_network]': None,
    'limitstart': '0',
    'task': None,
    'boxchecked': '0'
    # &a9e6dfb28837bdec78b9424f19a3a8f0=1
}
# timeout prevents the cron-driven container from hanging forever when the
# site is unresponsive (requests has no default timeout).
res = requests.post('https://www.divi.de/register/intensivregister?view=items',
                    data=form_param, timeout=60)
soup = BeautifulSoup(res.content, features='html.parser')
# Data rows alternate between css classes row0 / row1.
tr = soup.find_all('tr', class_=["row0", "row1"])
out = []
# Parse each table row into a JSON-serializable record. Column layout
# (observed from the indexing below): 0 = name/address, 1 = contact,
# 2 = federal state, 3-5 = status icons, 6 = last-update timestamp.
for row in tr:
    # initialize empty row object
    j_obj = {}
    cols = row.find_all('td')
    ### parse first column
    col1 = cols[0].getText().split('\n')[1:] # Hospital Name, "Department" (not included in some rows), Address
    hospitalName = col1[0].strip()
    department = ""
    col1Index = 1
    # A 5-line first column includes a department line; otherwise shift the
    # index back so the address lines are read from the right positions.
    if len(col1) == 5:
        department = col1[col1Index].strip()
    else:
        col1Index = col1Index - 1
    # Street line: last whitespace-separated token is the house number,
    # everything before it is the street name.
    street = " ".join(col1[col1Index + 1].strip().split(' ')[:-1]).strip()
    streetNr = " ".join(col1[col1Index + 1].strip().split(' ')[-1:]).strip()
    # City line: first token is the zip code, the rest is the city name.
    zipCode = col1[col1Index + 2].strip().split(' ')[0]
    city = " ".join(col1[col1Index + 2].strip().split(' ')[1:])
    ### parse second column
    col2TextList = cols[1].getText().split()
    contactUrl = ""
    # A "Website" token marks the presence of a link; pull its href and
    # normalize it to an absolute http: or mailto: URL.
    if "Website" in col2TextList:
        col2TextList.remove("Website")
        contactUrl = cols[1].find("a", href=True)['href']
        if contactUrl[0] == "/":
            if "@" in contactUrl:
                contactUrl = "mailto:" + contactUrl[1:]
            else:
                # href is site-relative; "http:/" + "/path" yields "http://path"
                # NOTE(review): this drops the divi.de host — confirm intended.
                contactUrl = "http:/" + contactUrl
        else:
            if "@" in contactUrl:
                contactUrl = "mailto:" + contactUrl
    # Remaining tokens are the free-text contact info.
    contactText = " ".join(col2TextList)
    ### parse third column
    state = " ".join(cols[2].getText().split())
    ### parse fourth column
    icuLowCare = get_status_from_string(cols[3].find('span')['class'][0])
    ### parse fifth column
    icuHighCare = get_status_from_string(cols[4].find('span')['class'][0])
    ### parse sixth column
    ecmo = get_status_from_string(cols[5].find('span')['class'][0])
    ### parse seventh column
    lastUpdateAtDateString = " ".join(cols[6].getText().split())
    # Stored as a string (str(datetime)) rather than a datetime object.
    lastUpdateAt = datetime.strptime(lastUpdateAtDateString, '%d.%m.%Y %H:%M').__str__()
    # lastUpdateAt = datetime.strptime(lastUpdateAtDateString, '%d.%m.%Y %H:%M')
    # get lat and lon
    # Geocode the full street address via Nominatim (one request per row).
    # geoUrl = f'https://nominatim.openstreetmap.org/search?street={street} {streetNr}&city={city}&postalcode={zipCode}&country=Germany&extratags=1&addressdetails=1&format=json'
    geoUrl = f'https://nominatim.openstreetmap.org/search?street={street} {streetNr}&city={city}&postalcode={zipCode}&country=Germany&format=json'
    geo_res = requests.get(geoUrl)
    geo_res_json = json.loads(geo_res.content)
    lat = 0.0
    lon = 0.0
    if len(geo_res_json) != 0:
        lat = float(geo_res_json[0]['lat'])
        lon = float(geo_res_json[0]['lon'])
    else:
        # Fallback: retry with zip code only; (0.0, 0.0) if that also fails.
        geoUrl2 = f'https://nominatim.openstreetmap.org/search?postalcode={zipCode}&country=Germany&format=json'
        geo_res2 = requests.get(geoUrl2)
        geo_res_json2 = json.loads(geo_res2.content)
        if len(geo_res_json2) != 0:
            lat = float(geo_res_json2[0]['lat'])
            lon = float(geo_res_json2[0]['lon'])
    ### generate json object
    j_obj['name'] = hospitalName
    j_obj['department'] = department
    addr_obj = {}
    addr_obj['street'] = street
    addr_obj['streetNr'] = streetNr
    addr_obj['zipCode'] = zipCode
    addr_obj['city'] = city
    addr_obj['state'] = state
    addr_obj['lat'] = lat
    addr_obj['lon'] = lon
    j_obj['address'] = addr_obj
    contact_obj = {}
    contact_obj['text'] = contactText
    contact_obj['url'] = contactUrl
    j_obj['contact'] = contact_obj
    status_obj = {}
    status_obj['icuLowCare'] = icuLowCare
    status_obj['icuHighCare'] = icuHighCare
    status_obj['ecmo'] = ecmo
    j_obj['status'] = status_obj
    j_obj['lastUpdate'] = lastUpdateAt
    out.append(j_obj)
# Persist the scraped records: GeoJSON file, plain JSON file, or MongoDB.
if args.json_file:
    if args.geo_features:
        # Build a GeoJSON FeatureCollection (CRS84, lon/lat order) with one
        # Point feature per hospital carrying the three status codes.
        geoJsonObj = {
            "type": "FeatureCollection",
            "crs": {
                "type": "name",
                "properties": {
                    "name": "urn:ogc:def:crs:OGC:1.3:CRS84"
                }
            }
        }
        geoFeatures = []
        for i, entry in enumerate(out):
            feature_obj = {
                "type": "Feature",
                "properties": {
                    "id": i + 1,  # 1-based feature id
                    "hospitalName": entry["name"],
                    "status_icuLowCare": entry["status"]["icuLowCare"],
                    "status_icuHighCare": entry["status"]["icuHighCare"],
                    "status_ecmo": entry["status"]["ecmo"]
                },
                "geometry": {
                    "type": "Point",
                    # GeoJSON ordering: [longitude, latitude, elevation]
                    "coordinates": [
                        entry["address"]["lon"],
                        entry["address"]["lat"],
                        0.0
                    ]
                }
            }
            geoFeatures.append(feature_obj)
        geoJsonObj["features"] = geoFeatures
        fileName = "geo_" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".json"
        with open(fileName, "w") as outfile:
            json.dump(geoJsonObj, outfile)
    else:
        # Raw dump of the scraped records, timestamped filename.
        fileName = "dump_" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".json"
        with open(fileName, "w") as outfile:
            json.dump(out, outfile)
else:
    # Store the snapshot in MongoDB ('mongo' resolves via the compose network).
    # NOTE(review): credentials duplicated from docker-compose.yml — move both
    # to a shared env file / secret.
    client = MongoClient('mongo', 27017, username='root', password='example')
    db = client["divi_db"]
    col = db["divi"]
    # insert_many raises InvalidOperation on an empty list; skip when the
    # scrape produced no rows.
    if out:
        col.insert_many(out)
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment