Commit 395000f3 authored by Serhey's avatar Serhey

Initial

parents
/vendor/
composer.lock
.php_cs.cache
./vendor/ezsystems/ezplatform-code-style/.php_cs
\ No newline at end of file
This diff is collapsed.
# eZ Platform Search Binary Extractor
eZ Publish made it possible to index binary files with the help of third-party binaries. This functionality is missing in the latest eZ Platform versions, and this bundle provides it.
It also provides an example of a binary extractor for PDF files, which uses the third-party [pdftotext](https://www.xpdfreader.com/download.html) binary.
## Installation
1. Require `contextualcode/ezplatform-search-binary-extractor` via composer:
```bash
composer require contextualcode/ezplatform-search-binary-extractor
```
2. Activate the bundle in `app/AppKernel.php`:
```php
$bundles = [
...
new ContextualCode\EzPlatformSearchBinaryExtractorBundle\EzPlatformSearchBinaryExtractorBundle(),
];
```
## Usage
First of all, please double-check that the "Searchable" checkbox is checked for the binary file field types that need to be searchable.
After the bundle is installed, the content of all PDF files will be indexed. You will need to rebuild the search index by running:
```bash
php bin/console ezplatform:reindex
```
It is also possible to build your own custom binary extractors. You just need to follow a few simple steps:
1. Create a new service which implements [`BinaryExtractor`](https://gitlab.com/contextualcode/ezplatform-search-binary-extractor/-/blob/1.0/src/FieldType/BinaryFile/BinaryExtractor.php). Please use [`BinaryExtractor\Pdf`](https://gitlab.com/contextualcode/ezplatform-search-binary-extractor/-/blob/1.0/src/FieldType/BinaryFile/BinaryExtractor/Pdf.php) as an example.
2. Tag your service with [`ezplatform.field_type.ezbinaryfile.binary_extractor` tag](https://gitlab.com/contextualcode/ezplatform-search-binary-extractor/-/blob/1.0/src/Resources/config/services.yml#L12).
\ No newline at end of file
File added
{
"name": "contextualcode/ezplatform-search-binary-extractor",
"license": "GPL-2.0-only",
"type": "ezplatform-bundle",
"description": "eZ Platform bundle which allows indexing the content of binary files",
"homepage": "http://www.contextualcode.com",
"minimum-stability": "stable",
"authors": [
{
"name": "Serhey Dolgushev",
"email": "[email protected]"
}
],
"autoload": {
"psr-4": {
"ContextualCode\\EzPlatformSearchBinaryExtractorBundle\\": "src/"
}
},
"require": {
"ezsystems/ezpublish-kernel": "^7.5"
},
"require-dev": {
"ezsystems/ezplatform-code-style": "^0.1.0"
},
"bin": ["bin/pdftotext"]
}
<?php
declare(strict_types=1);
namespace ContextualCode\EzPlatformSearchBinaryExtractorBundle\DependencyInjection;
use Symfony\Component\Config\FileLocator;
use Symfony\Component\DependencyInjection\ContainerBuilder;
use Symfony\Component\DependencyInjection\Extension\Extension;
use Symfony\Component\DependencyInjection\Extension\ExtensionInterface;
use Symfony\Component\DependencyInjection\Loader\YamlFileLoader;
/**
 * Container extension of the bundle.
 *
 * Its only job is to register the service definitions found in
 * Resources/config/services.yml.
 */
final class EzPlatformSearchBinaryExtractorExtension extends Extension implements ExtensionInterface
{
    /**
     * Loads the bundle's service definitions into the container.
     *
     * @param array            $configs   Bundle configuration; not read by this extension.
     * @param ContainerBuilder $container Container the services are registered in.
     */
    public function load(array $configs, ContainerBuilder $container): void
    {
        $configDirectory = __DIR__ . '/../Resources/config';
        $yamlLoader = new YamlFileLoader($container, new FileLocator($configDirectory));

        $yamlLoader->load('services.yml');
    }
}
<?php
declare(strict_types=1);
namespace ContextualCode\EzPlatformSearchBinaryExtractorBundle;
use ContextualCode\EzPlatformSearchBinaryExtractorBundle\DependencyInjection\EzPlatformSearchBinaryExtractorExtension;
use Symfony\Component\DependencyInjection\Extension\ExtensionInterface;
use Symfony\Component\HttpKernel\Bundle\Bundle;
/**
 * Bundle entry point; exposes the bundle's container extension.
 */
final class EzPlatformSearchBinaryExtractorBundle extends Bundle
{
    /**
     * Builds the container extension for this bundle.
     *
     * A new extension instance is created on every call.
     */
    public function getContainerExtension(): ExtensionInterface
    {
        $extension = new EzPlatformSearchBinaryExtractorExtension();

        return $extension;
    }
}
<?php
namespace ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile;
use eZ\Publish\SPI\Persistence\Content\Field;
/**
 * Contract for services that turn the binary file stored in a field into
 * plain text for the search index.
 *
 * SearchField iterates over the registered extractors and calls extract()
 * on the first one whose supports() returns true.
 */
interface BinaryExtractor
{
/**
 * Tells whether this extractor can handle the file stored in the given field.
 *
 * @param Field $field Persistence field holding the binary file.
 *
 * @return bool True when extract() may be called for this field.
 */
public function supports(Field $field): bool;
/**
 * Extracts the text content of the file stored in the given field.
 *
 * @param Field $field Persistence field holding the binary file.
 *
 * @return string|null Extracted text, or null when nothing could be extracted.
 */
public function extract(Field $field): ?string;
}
<?php
declare(strict_types=1);
namespace ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile\BinaryExtractor;
use ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile\BinaryExtractor;
use eZ\Publish\Core\IO\IOService;
use eZ\Publish\Core\IO\Values\BinaryFile;
use eZ\Publish\SPI\Persistence\Content\Field;
use Symfony\Component\Process\Process;
/**
 * Binary extractor for PDF files.
 *
 * Copies the stored file into a local temporary file and runs the
 * `pdftotext` binary on it to obtain the text content.
 */
class Pdf implements BinaryExtractor
{
    /** Mime types handled by this extractor; subclasses may override. */
    protected const SUPPORTED_MIMETYPES = ['application/pdf'];

    /** @var IOService */
    private $ioService;

    /** @var string Path to the pdftotext executable. */
    private $binaryPath;

    /**
     * @param IOService $ioService  IO service used to load the stored binary file.
     * @param string    $binaryPath Path to the pdftotext executable. The default is a
     *                              relative path, so it depends on the working directory
     *                              the PHP process is started from.
     */
    public function __construct(IOService $ioService, string $binaryPath = 'bin/pdftotext')
    {
        $this->ioService = $ioService;
        $this->binaryPath = $binaryPath;
    }

    /**
     * Tells whether the field holds a file with a supported mime type.
     *
     * Fields without external data (e.g. an empty binary file field) are
     * reported as unsupported instead of triggering an undefined-offset notice.
     */
    public function supports(Field $field): bool
    {
        $mimeType = $field->value->externalData['mimeType'] ?? null;
        if ($mimeType === null) {
            return false;
        }

        // static:: so that a subclass overriding SUPPORTED_MIMETYPES is honoured.
        return in_array($mimeType, static::SUPPORTED_MIMETYPES, true);
    }

    /**
     * Extracts the text content of the PDF stored in the field.
     *
     * @return string|null Extracted text, or null when the file is missing,
     *                     cannot be copied locally, or pdftotext fails.
     */
    public function extract(Field $field): ?string
    {
        $fileId = $field->value->externalData['id'] ?? null;
        if ($fileId === null || $fileId === '') {
            return null;
        }

        $file = $this->ioService->loadBinaryFile($fileId);
        if (!$file instanceof BinaryFile || !$this->ioService->exists($file->id)) {
            return null;
        }

        // A DFS/AWS binary data handler might be in use, so the file content
        // has to be copied into a local file before pdftotext can read it.
        $tmpFile = tmpfile();
        if ($tmpFile === false) {
            return null;
        }

        try {
            if (fwrite($tmpFile, $this->ioService->getFileContents($file)) === false) {
                return null;
            }
            // Make sure everything is written out before the external process reads the file.
            fflush($tmpFile);

            $tmpMetadata = stream_get_meta_data($tmpFile);

            return $this->pdfToText($tmpMetadata['uri']);
        } finally {
            // Closing the handle also removes the temporary file, even on failure.
            fclose($tmpFile);
        }
    }

    /**
     * Runs pdftotext on a local file and returns its trimmed output.
     *
     * @param string $filepath Path of a local PDF file.
     *
     * @return string|null Extracted text, or null when the process did not exit successfully.
     */
    protected function pdfToText(string $filepath): ?string
    {
        // '-' as the output argument makes pdftotext write the text to stdout.
        $process = new Process([$this->binaryPath, '-nopgbrk', '-q', $filepath, '-']);
        $process->run();

        if (!$process->isSuccessful()) {
            return null;
        }

        return trim($process->getOutput());
    }
}
<?php
declare(strict_types=1);
namespace ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile;
use eZ\Publish\Core\FieldType\BinaryFile\SearchField as Base;
use eZ\Publish\SPI\Persistence\Content\Field;
use eZ\Publish\SPI\Persistence\Content\Type\FieldDefinition;
use eZ\Publish\SPI\Search;
/**
 * Indexable implementation for binary file fields which, on top of the data
 * indexed by the base class, indexes the text extracted from the file.
 */
class SearchField extends Base
{
    /** @var BinaryExtractor[] */
    protected $binaryExtractors;

    /**
     * @param iterable $binaryExtractors Extractors to try, in iteration order.
     */
    public function __construct(iterable $binaryExtractors)
    {
        $this->binaryExtractors = $binaryExtractors;
    }

    /**
     * Returns the base index data with an extra 'file_content' full text field.
     *
     * NOTE(review): the data uses FullTextField while getIndexDefinition()
     * declares TextField for the same name - confirm this is intended.
     */
    public function getIndexData(Field $field, FieldDefinition $fieldDefinition): array
    {
        $searchFields = parent::getIndexData($field, $fieldDefinition);

        $fileContent = $this->extractFileContent($field);
        $contentType = new Search\FieldType\FullTextField();
        $searchFields[] = new Search\Field('file_content', $fileContent, $contentType);

        return $searchFields;
    }

    /**
     * Returns the base index definition extended with the 'file_content' entry.
     */
    public function getIndexDefinition()
    {
        return array_replace(
            parent::getIndexDefinition(),
            ['file_content' => new Search\FieldType\TextField()]
        );
    }

    /**
     * Delegates to the first extractor that supports the field.
     *
     * @return string|null Extracted text, or null when no extractor supports the field.
     */
    protected function extractFileContent(Field $field): ?string
    {
        foreach ($this->binaryExtractors as $candidate) {
            if (!$candidate instanceof BinaryExtractor) {
                continue;
            }

            if (!$candidate->supports($field)) {
                continue;
            }

            return $candidate->extract($field);
        }

        return null;
    }
}
services:
# Indexable implementation registered for the "ezbinaryfile" field type alias.
# It receives every service tagged as a binary extractor as its only argument.
# NOTE(review): presumably this id replaces the kernel's default indexable
# service for ezbinaryfile - confirm against the kernel configuration.
ezpublish.fieldType.indexable.ezbinaryfile:
class: ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile\SearchField
arguments:
- !tagged ezplatform.field_type.ezbinaryfile.binary_extractor
tags:
- {name: ezpublish.fieldType.indexable, alias: ezbinaryfile}
# PDF extractor; the tag below is the one collected by the "!tagged" argument above.
ContextualCode\EzPlatformSearchBinaryExtractorBundle\FieldType\BinaryFile\BinaryExtractor\Pdf:
arguments:
- "@ezpublish.fieldType.ezbinaryfile.io_service"
tags: [ezplatform.field_type.ezbinaryfile.binary_extractor]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment