Commit 5f8fe304 authored by dnppp

remplacement du micro-travail par de l'OCR

parent cc37704f
......@@ -34,6 +34,13 @@ import base64
import codecs
import sqlite3
import random
from PIL import Image
import statistics
import subprocess
import unidecode
import re
# Number of votes required
REQUIRED = 4717396
......@@ -50,15 +57,6 @@ DATA_INSEE = "data_insee.raw.sqlite"
NO_SOUTIEN = "Aucun soutien n'est accept"
# English summary of the note below: captchas were added on 17/06 23:00, so the
# anti-captcha service (https://anti-captcha.com) is required; the API key is
# read from the file named by ANTICAPTCHA_KEY_FILE.
''' UPDATE 17/06 23:00 -> captchas ajoutes: on a besoin d'utiliser anti captcha.
https://anti-captcha.com
Mettez la cle dans le fichier ANTICAPTCHA_KEY
'''
ANTICAPTCHA_KEY_FILE = KEYS_ROOT + "key.dat"
ANTICAPTCHA_KEY = ""
with open(ANTICAPTCHA_KEY_FILE) as ac_file:
    # Only the first line carries the key.  readline() avoids loading the whole
    # file and yields "" for an empty file (same result as a blank first line)
    # instead of failing with a bare IndexError.
    ANTICAPTCHA_KEY = ac_file.readline().rstrip("\r\n")
''' UPDATE 17/06 20:00 -> google captcha ajoute (je croyais le gvt anti-GAFAM? mais que se passe-t-il?)
une fois le captcha valide, il vous ajoute des cookies, les charger ici.
......@@ -134,47 +132,73 @@ def extract_data_between(data, start, end):
data = data[:data.find(end)]
return data
def check_captcha_task(taskId, sleepTime):
    '''Poll anti-captcha.com until a recognition task is ready.

    taskId: value sent back to the service under the "taskId" key.
    sleepTime: seconds to sleep between two polls while the reported status
        is "processing".

    Returns the decoded JSON reply whose "status" is "ready".
    Raises Exception when the reply cannot be decoded, lacks a "status" field
    or reports any status other than "processing"/"ready".
    '''
    def _poll():
        return requests.post("https://api.anti-captcha.com/getTaskResult", json={"clientKey":ANTICAPTCHA_KEY,"taskId":taskId})

    result = _poll()
    try:
        jresult = json.loads(result.content)
        while jresult["status"] != "ready":
            if jresult["status"] != "processing":
                # result.text (str) rather than result.content (bytes): concatenating
                # bytes to a str raises TypeError and would hide the real error.
                raise Exception("error while processing microtask: "+result.text)
            time.sleep(sleepTime)
            result = _poll()
            jresult = json.loads(result.content)
    except Exception as e:
        raise Exception("error while processing microtask: "+result.text+" (exception: "+str(e)+")")
    return jresult


def make_captcha_task(task_data):
    '''Submit a recognition task to anti-captcha.com.

    task_data: JSON-serialisable task description sent under the "task" key.
    Returns the raw HTTP response of the createTask call.
    '''
    # https like the polling endpoint, so the client key is not sent in clear text.
    return requests.post("https://api.anti-captcha.com/createTask", json={"clientKey":ANTICAPTCHA_KEY,"task":task_data})


# Initial half-width of the window scanned around a pixel by avg_neib.
NEIB_DISTANCE = 2
# Largest half-width avg_neib tries before giving up on a pixel.
NEIB_MAX_DISTANCE = 5
# Channel values treated as noise: captcha_handle_file replaces a pixel whose
# three channels are all in this list.
STRIP_COLOR = [112, 117]
# A run of 'Y' followed by '=' in the OCR text; substituted by a single 'Y'.
RE_PATTERN0 = re.compile(r'Y+\=')
# Any run of non-alphanumeric characters (underscore included); removed from the OCR text.
RE_PATTERN1 = re.compile(r'[\W_]+')


def avg_neib(im, x, y, width, height, distance=NEIB_DISTANCE, max_distance=NEIB_MAX_DISTANCE):
    '''Return the average colour of the usable pixels around (x, y).

    im: object exposing getpixel((x, y)) and returning at least three channels.
    x, y: coordinates of the pixel being repaired.
    width, height: image dimensions, used to clip the window.
    distance: initial half-width of the square window centred on (x, y).
    max_distance: the window grows by one until a usable pixel is found or
        this half-width has been tried.

    A neighbour is usable when none of its three channels is in STRIP_COLOR.
    Returns an (r, g, b) tuple of ints (component-wise mean, rounded down), or
    (0, 0, 0) after printing a message when no usable neighbour exists.
    '''
    while True:
        neib_colors = []
        # Symmetric window clipped to the image; row and column 0 are valid pixels.
        for i in range(max(x - distance, 0), min(x + distance, width - 1) + 1):
            for j in range(max(y - distance, 0), min(y + distance, height - 1) + 1):
                neib_rgb = im.getpixel((i, j))
                if not any(channel in STRIP_COLOR for channel in neib_rgb[:3]):
                    neib_colors.append(neib_rgb)
        if neib_colors:
            count = len(neib_colors)
            return tuple(sum(color[k] for color in neib_colors) // count for k in range(3))
        if distance >= max_distance:
            print('Could not mean pixel (%s, %s)' % (x, y))
            return (0, 0, 0)
        distance += 1


def captcha_handle_file(path):
    '''Clean a captcha image and return the text recognised by tesseract.

    Steps:
      1. every pixel whose three channels are all in STRIP_COLOR is replaced
         by the average of its usable neighbours (see avg_neib);
      2. the result goes through two ImageMagick `convert` passes;
      3. tesseract reads the final image (--psm 7) and its output is
         transliterated with unidecode, then filtered with RE_PATTERN0 and
         RE_PATTERN1.

    path: path of the captcha image; its base name (without extension) names
        the intermediate files written under /tmp.
    Returns the cleaned OCR text.
    Raises subprocess.CalledProcessError when tesseract exits with an error.
    '''
    filename = os.path.splitext(os.path.basename(path))[0]
    improved_path = '/tmp/%s-improved.png' % filename
    convert0_path = '/tmp/%s-convert0.png' % filename
    convert1_path = '/tmp/%s-convert1.png' % filename
    try:
        # Normalise to RGB so that getpixel always yields three channels, and
        # release the source file as soon as the pixels are loaded.
        with Image.open(path) as source:
            im = source.convert('RGB')
        width, height = im.size
        out = Image.new('RGB', im.size, 0xFFFFFF)
        for x in range(width):
            for y in range(height):
                r, g, b = im.getpixel((x, y))
                if r in STRIP_COLOR and g in STRIP_COLOR and b in STRIP_COLOR:
                    out.putpixel((x, y), avg_neib(im, x, y, width, height))
                else:
                    out.putpixel((x, y), (r, g, b))
        out.save(improved_path)
        out.close()

        # Argument lists (no shell) so the file name is never interpreted by a shell.
        convert_cmd0 = [
            'convert', improved_path, '-matte',
            '(', '+clone', '-fuzz', '5%', '-transparent', '#8c8c8c', ')',
            '-compose', 'DstOut', '-composite',
            '-background', 'white', '-flatten',
            '-define', 'connected-components:area-threshold=11',
            '-define', 'connected-components:mean-color=true',
            '-connected-components', '4',
            '-threshold', '50%',
            '-background', 'black', '-flatten',
            convert0_path,
        ]
        convert_cmd1 = [
            'convert', convert0_path,
            '-background', 'white', '-flatten', '-blur', '0x2',
            convert1_path,
        ]
        tesseract_cmd = ['tesseract', convert1_path, 'stdout', '--psm', '7']
        subprocess.call(convert_cmd0)
        subprocess.call(convert_cmd1)
        raw = subprocess.check_output(tesseract_cmd)
        result = raw.decode(sys.stdout.encoding or 'utf-8').strip()
        result = unidecode.unidecode(result)
        result = RE_PATTERN0.sub('Y', result)
        result = RE_PATTERN1.sub('', result)
        return result
    finally:
        # Remove the intermediate files even when a step failed part-way.
        for leftover in (improved_path, convert0_path, convert1_path):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
def get_captcha(image):
    '''Recognise a captcha by saving it to a temporary file and running OCR on it.

    image: iterable yielding the bytes chunks of the captcha picture; each
        chunk is written as-is to the temporary file.
    Returns the text produced by captcha_handle_file for that file.
    '''
    # Local import: keeps this function self-contained.
    import tempfile

    # mkstemp gives a unique name, so two runs cannot overwrite each other's file.
    fd, filename = tempfile.mkstemp(prefix="update.", suffix=".png")
    try:
        with os.fdopen(fd, 'wb') as f:
            for chunk in image:
                f.write(chunk)
        ret = captcha_handle_file(filename)
    finally:
        # The picture is removed whether or not the OCR step succeeded.
        os.remove(filename)
    return ret
def isDigit(c):
......@@ -225,9 +249,10 @@ def bypass_captcha(url,idx,I,J):
token = extract_data_between(token, "", "\" />")
print("\tGot captcha token="+token)
image = requests.get("https://www.referendum.interieur.gouv.fr/bundles/ripconsultation/securimage/securimage_show.php", cookies=COOKIES, stream=True)
CAPTCHA_VAL = get_captcha(base64.b64encode(image.content))
CAPTCHA_VAL = get_captcha(image)
print("\t"+str(idx)+","+I+","+J+","+"CAPTCHA="+CAPTCHA_VAL)
time.sleep((250+random.randint(0,250))/1000)
rep = requests.post(current_url, cookies=COOKIES, data={"form[captcha]":CAPTCHA_VAL,"form[_token]":token})
data = trim_eol(rep.text)
except Exception as e:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment