Pdf.php 1.95 KB
Newer Older
Serhey's avatar
Initial  
Serhey committed
1 2 3 4 5 6 7
<?php

declare(strict_types=1);

namespace ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile\BinaryExtractor;

use ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile\BinaryExtractor;
8
use eZ\Publish\Core\IO\IOServiceInterface;
Serhey's avatar
Initial  
Serhey committed
9 10 11 12 13 14 15 16
use eZ\Publish\Core\IO\Values\BinaryFile;
use eZ\Publish\SPI\Persistence\Content\Field;
use Symfony\Component\Process\Process;

/**
 * Extracts the plain text of PDF binary files by running the `pdftotext`
 * command line tool on a local temporary copy of the file.
 */
class Pdf implements BinaryExtractor
{
    /** Mime types this extractor is able to handle. */
    protected const SUPPORTED_MIMETYPES = ['application/pdf'];

    /** @var IOServiceInterface */
    private $ioService;

    /**
     * @param IOServiceInterface $ioService IO service used to load the binary file and read its contents
     */
    public function __construct(IOServiceInterface $ioService)
    {
        $this->ioService = $ioService;
    }

    /**
     * Tells whether the field stores a file with a supported mime type.
     *
     * @param Field $field Field whose `externalData['mimeType']` is inspected
     *
     * @return bool False when the mime type is missing or not supported
     */
    public function supports(Field $field): bool
    {
        // A field without external data has no mime type; treat it as unsupported
        // instead of raising an "undefined index" warning.
        $mimeType = $field->value->externalData['mimeType'] ?? null;

        return in_array($mimeType, self::SUPPORTED_MIMETYPES, true);
    }

    /**
     * Extracts the text of the PDF file referenced by the field.
     *
     * @param Field $field Field whose `externalData['id']` identifies the binary file
     *
     * @return string|null Extracted text, or null when the file is unavailable,
     *                     cannot be copied locally, or the conversion failed
     */
    public function extract(Field $field): ?string
    {
        $fileId = $field->value->externalData['id'] ?? null;
        if ($fileId === null) {
            return null;
        }

        $file = $this->ioService->loadBinaryFile($fileId);
        if (!$file instanceof BinaryFile) {
            return null;
        }

        if (!$this->ioService->exists($file->id)) {
            return null;
        }

        // A DFS/AWS binary data handler might be in use, so the file content
        // is copied into a local temporary file that pdftotext can read.
        $tmpFile = tmpfile();
        if ($tmpFile === false) {
            return null;
        }

        try {
            if (fwrite($tmpFile, $this->ioService->getFileContents($file)) === false) {
                return null;
            }
            // Make sure everything is on disk before the external tool opens the file by path.
            fflush($tmpFile);

            return $this->pdfToText(stream_get_meta_data($tmpFile)['uri']);
        } finally {
            // Closing the handle also removes the temporary file, even if an exception was thrown.
            fclose($tmpFile);
        }
    }

    /**
     * Runs `bin/pdftotext` on a local file and returns its filtered output.
     *
     * NOTE(review): the binary path is relative, so it presumably resolves
     * against the working directory of the PHP process - confirm for CLI vs web.
     *
     * @param string $filepath Path of the local PDF file
     *
     * @return string|null Trimmed and filtered text, or null when the command did not exit successfully
     */
    protected function pdfToText(string $filepath): ?string
    {
        $process = new Process(['bin/pdftotext', '-nopgbrk', '-q', $filepath, '-']);
        $process->run();

        if (!$process->isSuccessful()) {
            return null;
        }

        return $this->filterString(trim($process->getOutput()));
    }

    /**
     * Normalizes the string to UTF-8 and removes every character that is
     * neither printable nor a line feed.
     *
     * @param string $string Raw text
     *
     * @return string Filtered text; an empty string if the regular expression engine reports an error
     */
    protected function filterString(string $string): string
    {
        // The UTF-8 to UTF-8 conversion is applied before the /u pattern below runs.
        // NOTE(review): presumably this is meant to clean up invalid byte sequences - confirm.
        $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');

        // preg_replace() returns null on error, which would violate the declared return type.
        return preg_replace('/[^[:print:]\n]/u', '', $string) ?? '';
    }
}