Assistance with Latest Calendar Retrieval on dre.pt Scraper

Hello @hgg, I hope this message finds you well. First, thank you for sharing your dre.pt application repository—it has been very helpful for my work. I’m currently developing a scraper for this application in PHP, but I can only retrieve calendar data from 2020 and 2021; I’m unable to get the most recent entries. Could you please advise on how I might resolve this? Is this a known issue, or is there something specific that I’m missing? Here is my current test code:

<?php

/**
 * Fetch the session cookies and the anonymous CSRF token used by the
 * diariodarepublica.pt screen services.
 *
 * Two GET requests are made: one to the module-version endpoint, whose
 * response headers are scanned for Set-Cookie values, and one to
 * OutSystems.js, whose body is scanned for the AnonymousCSRFToken literal.
 *
 * The script is terminated with die() if either request fails or the token
 * cannot be found.
 *
 * @return array{csrfToken: string, cookies: string} The CSRF token and a
 *         ready-to-send "name=value; name=value" Cookie header value.
 */
function getCSRFTokenAndCookies(): array
{
    $cookies_url = 'https://diariodarepublica.pt/dr/moduleservices/moduleversioninfo';
    $csrf_url = 'https://diariodarepublica.pt/dr/scripts/OutSystems.js';

    // Start a cURL session to obtain the cookies.
    $ch = curl_init($cookies_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true); // Include the response headers in the output
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // Follow redirects
    // NOTE(review): TLS verification is disabled here, which exposes the
    // request to man-in-the-middle tampering. Kept as in the original script;
    // enable both options once a CA bundle is configured.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    $response = curl_exec($ch);
    if ($response === false) {
        // Read the error before closing the handle, then release it.
        $error = curl_error($ch);
        curl_close($ch);
        die("Error al obtener las cookies: " . $error);
    }
    curl_close($ch);

    // Extract the "name=value" part of every Set-Cookie header.
    // The match is case-insensitive because HTTP/2 responses carry lower-case
    // header names, and it stops at the first ';' or at end of line so that
    // cookies without attributes are captured as well.
    preg_match_all('/^Set-Cookie:\s*([^;\r\n]+)/mi', $response, $cookies);
    $cookieHeader = implode("; ", array_map('trim', $cookies[1]));

    // Fetch the script that embeds the CSRF token.
    $ch = curl_init($csrf_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    $csrfResponse = curl_exec($ch);
    if ($csrfResponse === false) {
        $error = curl_error($ch);
        curl_close($ch);
        die("Error al obtener el token CSRF: " . $error);
    }
    curl_close($ch);

    // Look for the CSRF token literal inside the script body.
    preg_match('/AnonymousCSRFToken="(.*?)"/', $csrfResponse, $matches);
    if (empty($matches[1])) {
        die("Error: No se pudo encontrar el token CSRF.");
    }
    $csrfToken = $matches[1];

    return ['csrfToken' => $csrfToken, 'cookies' => $cookieHeader];
}

/**
 * Send a JSON POST request to the DataActionGetDRByDataCalendario screen
 * service, authenticated with the given CSRF token and cookies.
 *
 * The script is terminated with die() if the cURL transfer itself fails.
 *
 * @param string $csrfToken Value sent in the X-CSRFToken header.
 * @param string $cookies   Value sent in the Cookie header ("a=b; c=d").
 * @param string $payload   JSON-encoded request body.
 *
 * @return mixed The response body decoded with json_decode() into
 *               associative arrays; null when the body is not valid JSON.
 */
function makeRequest($csrfToken, $cookies, $payload)
{
    $url = "https://diariodarepublica.pt/dr/screenservices/dr/Home/home/DataActionGetDRByDataCalendario";

    // Initialise the cURL request.
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        "Content-Type: application/json; charset=UTF-8",
        "X-CSRFToken: $csrfToken",
        "Cookie: $cookies"
    ]);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);

    // NOTE(review): TLS certificate verification is disabled, which exposes
    // the request to man-in-the-middle tampering. Kept as in the original
    // script; enable both options once a CA bundle is configured.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

    // Execute the request and collect the response body.
    $response = curl_exec($ch);

    // Abort on transport-level failure.
    if ($response === false) {
        $error = curl_error($ch);
        curl_close($ch);
        die("Error en la solicitud CURL: $error");
    }

    curl_close($ch);

    // Decode the JSON response into associative arrays.
    return json_decode($response, true);
}

// Obtain the CSRF token and the session cookies.
$tokens = getCSRFTokenAndCookies();
$csrfToken = $tokens['csrfToken'];
$cookies = $tokens['cookies'];

// Build the payload from an array and json_encode() it to avoid formatting
// problems. The calendar date and the version identifiers below are
// hard-coded values.
$payload = json_encode([
    "versionInfo" => [
        "moduleVersion" => "vPN0ZropVOcojIMGYEboXg",
        "apiVersion" => "A00rktBtkSvxDLsFy+6mgg"
    ],
    "viewName" => "Home.home",
    "screenData" => [
        "variables" => [
            "ContagemLegConsolidada" => "",
            "ContagemJurisprudencia" => "",
            "UtilizadorGestorDeConteudo" => false,
            "HasSerie1" => true,
            "HasSerie2" => true,
            "DataUltimaPublicacao" => "2024-10-21",
            "IsRendered" => true,
            "IsMobile" => "",
            "ContagemLexionario" => "",
            "SerieI" => false,
            "_serieIInDataFetchStatus" => 1,
            "DataCalendario" => "2024-10-21",
            "_dataCalendarioInDataFetchStatus" => 1
        ]
    ],
    "clientVariables" => [
        "NewUser" => "https://dre.pt/dre/utilizador/registar",
        "PesquisaAvancada" => "https://dre.pt/dre/pesquisa-avancada",
        "NIC" => "",
        "UtilizadorPortalIdOld" => "0",
        "Login" => "https://dre.pt/dre/utilizador/entrar",
        "TotalResultados" => 0,
        "Search" => false,
        "DicionarioJuridicoId" => "0",
        "FullHTMLURL_EN" => "https://dre.pt/dre/en",
        "Name" => "",
        "ShowResult" => false,
        "EntityId_Filter" => 0,
        "BookId_Filter" => 0,
        "Email" => "",
        "StartIndex" => 0,
        "paginaJson" => "",
        "Pesquisa" => "lei 5/96",
        "CookiePath" => "/dre/",
        "DataInicial_Filter" => "2024-01-01",
        "DataFinal_Filter" => "2024-12-31",
        "DiarioRepublicaId" => "",
        "Query_Filter" => "",
        "UtilizadorPortalId" => "0",
        "t" => "",
        "Session_GUID" => "cd330488-e5cd-45af-abee-08c61ef3e3c0",
        "ActoLegislativoId_Filter" => 0,
        "FullHTMLURL" => "https://dre.pt/dre/home",
        "TipoDeUtilizador" => "",
        "GUID" => "cf2169ce-5ccf-4615-a0ee-676206dab74f",
        "IsColecaoLegislacaoFilter" => true
    ]
]);

// Perform the request.
$response = makeRequest($csrfToken, $cookies, $payload);

// Check whether the request returned any hits. Json_Out is itself a JSON
// string nested inside the response, so it needs a second decode.
if (isset($response['data']['Json_Out'])) {
    $jsonOut = json_decode($response['data']['Json_Out'], true);
    if (isset($jsonOut['hits']['hits']) && count($jsonOut['hits']['hits']) > 0) {
        // Print every document that was returned.
        foreach ($jsonOut['hits']['hits'] as $hit) {
            $document = $hit['_source'];
            echo "Documento: " . $document['conteudoTitle'] . "\n";
            echo "Fecha de publicación: " . $document['dataPublicacao'] . "\n";
            echo "-------------------------\n";
        }
    } else {
        echo "No se encontraron documentos.\n";
    }
} else {
    echo "Error: Respuesta inesperada.\n";
}

// Save the full response to a JSON file for debugging.
$responseFormatted = json_encode($response, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
file_put_contents('response_pretty.json', $responseFormatted);
echo "Respuesta guardada en response_pretty.json\n";

And i get this response: { "versionInfo": { "hasModuleVersionChanged": false, "hasApiVersionChanged": false }, "data": { "Json_Out": "{"took":11,"timed_out":false,"_shards":{"total":3,"successful":3,"skipped":0,"failed":0},"hits":{"total":{"value":10000,"relation":"gte"},"max_score":null,"hits":[{"_index":"dre-prod-22082021","_id":"162314520_DiarioRepublica","_score":null,"_source":{"dbId":162314520,"conteudoTitle":"Diário da República n.º 83/2021, Série II de 2021-04-29","dataPublicacao":"2021-04-29","numero":"83"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"162237947_DiarioRepublica","_score":null,"_source":{"dbId":162237947,"conteudoTitle":"Diário da República n.º 82/2021, Série II de 2021-04-28","dataPublicacao":"2021-04-28","numero":"82"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"138755780_DiarioRepublica","_score":null,"_source":{"dbId":138755780,"conteudoTitle":"Diário da República n.º 143/2020, Série I de 2020-07-24","dataPublicacao":"2020-07-24","numero":"143"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"143189677_DiarioRepublica","_score":null,"_source":{"dbId":143189677,"conteudoTitle":"Diário da República n.º 183/2020, Série I de 2020-09-18","dataPublicacao":"2020-09-18","numero":"183"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"138348117_DiarioRepublica","_score":null,"_source":{"dbId":138348117,"conteudoTitle":"Diário da República n.º 139/2020, Série II de 2020-07-20","dataPublicacao":"2020-07-20","numero":"139"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"138382359_DiarioRepublica","_score":null,"_source":{"dbId":138382359,"conteudoTitle":"Diário da República n.º 140/2020, Série II de 2020-07-21","dataPublicacao":"2020-07-21","numero":"140"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"134918010_DiarioRepublica","_score":null,"_source":{"dbId":134918010,"conteudoTitle":"Diário da República n.º 107/2020, Série II de 
2020-06-02","dataPublicacao":"2020-06-02","numero":"107"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"159539872_DiarioRepublica","_score":null,"_source":{"dbId":159539872,"conteudoTitle":"Diário da República n.º 53/2021, Série II de 2021-03-17","dataPublicacao":"2021-03-17","numero":"53"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"134392711_DiarioRepublica","_score":null,"_source":{"dbId":134392711,"conteudoTitle":"Diário da República n.º 101/2020, Série II de 2020-05-25","dataPublicacao":"2020-05-25","numero":"101"},"sort":[0]},{"_index":"dre-prod-22082021","_id":"134505596_DiarioRepublica","_score":null,"_source":{"dbId":134505596,"conteudoTitle":"Diário da República n.º 102/2020, Série I de 2020-05-26","dataPublicacao":"2020-05-26","numero":"102"},"sort":[0]}]},"aggregations":{"SerieAgg":{"doc_count_error_upper_bound":0,"sum_other_doc_count":0,"buckets":[{"key":"I","doc_count":41742},{"key":"III","doc_count":33985},{"key":"II","doc_count":25237}]}}}" }, "rolesInfo": "," }

I appreciate your time and any guidance you can provide, because I’m quite lost at the moment… Best regards, Luis

Edited by luisterron