[FEATURE] Get a file's text content if possible 56/36556/9
authorIngo Renner <ingo@typo3.org>
Sun, 1 Feb 2015 06:53:22 +0000 (22:53 -0800)
committerStefan Froemken <froemken@gmail.com>
Fri, 13 Mar 2015 08:42:08 +0000 (09:42 +0100)
Currently FAL only allows extracting metadata from files. This
patch also allows extracting text content from files. This can
be useful for search engines or for providing snippets/teasers
in document archives.

Multiple text extractors can be registered to allow dealing with
different file types. A plain text extractor is provided by the core.

This is also a successor to the former textExtract service interface
implemented by several extensions: http://bit.ly/1D0x92M

Fixes: #36743
Releases: master
Change-Id: I1ce414c99fb26413eedd32422821e1a8802010de
Reviewed-on: http://review.typo3.org/36556
Reviewed-by: Frans Saris <franssaris@gmail.com>
Tested-by: Frans Saris <franssaris@gmail.com>
Reviewed-by: Frank Nägler <typo3@naegler.net>
Tested-by: Frank Nägler <typo3@naegler.net>
Reviewed-by: Stefan Froemken <froemken@gmail.com>
Tested-by: Stefan Froemken <froemken@gmail.com>
typo3/sysext/core/Classes/Resource/TextExtraction/PlainTextExtractor.php [new file with mode: 0644]
typo3/sysext/core/Classes/Resource/TextExtraction/TextExtractorInterface.php [new file with mode: 0644]
typo3/sysext/core/Classes/Resource/TextExtraction/TextExtractorRegistry.php [new file with mode: 0644]
typo3/sysext/core/Documentation/Changelog/master/Feature-36743-FAL-TextExtractorRegistry.rst [new file with mode: 0644]
typo3/sysext/core/Tests/Unit/Resource/TextExtraction/PlainTextExtractorTest.php [new file with mode: 0644]
typo3/sysext/core/Tests/Unit/Resource/TextExtraction/TextExtractorRegistryTest.php [new file with mode: 0644]
typo3/sysext/core/ext_localconf.php

diff --git a/typo3/sysext/core/Classes/Resource/TextExtraction/PlainTextExtractor.php b/typo3/sysext/core/Classes/Resource/TextExtraction/PlainTextExtractor.php
new file mode 100644 (file)
index 0000000..04281ff
--- /dev/null
@@ -0,0 +1,65 @@
+<?php
+namespace TYPO3\CMS\Core\Resource\TextExtraction;
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+use TYPO3\CMS\Core\Resource\FileInterface;
+use TYPO3\CMS\Core\Utility\PathUtility;
+
+/**
+ * A simple text extractor to extract text from plain text files.
+ *
+ */
+class PlainTextExtractor implements TextExtractorInterface {
+
+       /**
+        * Tells whether this extractor is able to read the given file.
+        *
+        * Only files whose MIME type is exactly "text/plain" are accepted.
+        *
+        * @param FileInterface $file
+        * @return bool
+        */
+       public function canExtractText(FileInterface $file) {
+               return $file->getMimeType() === 'text/plain';
+       }
+
+       /**
+        * Reads the given file and hands back its content.
+        *
+        * The result of file_get_contents() is returned as-is.
+        *
+        * @param FileInterface $file
+        * @return string
+        */
+       public function extractText(FileInterface $file) {
+               $localPath = $file->getForLocalProcessing(FALSE);
+               $text = file_get_contents($localPath);
+
+               // For remote storages the file is a temporary copy in typo3temp
+               // that has to be removed again. Comparing the base names is
+               // enough to detect that case: the name of the temporary copy is
+               // so unique that a file with the same name is very unlikely to
+               // exist in a storage.
+               $isTemporaryCopy = PathUtility::basename($localPath) !== $file->getName();
+               if ($isTemporaryCopy) {
+                       unlink($localPath);
+               }
+
+               return $text;
+       }
+
+}
diff --git a/typo3/sysext/core/Classes/Resource/TextExtraction/TextExtractorInterface.php b/typo3/sysext/core/Classes/Resource/TextExtraction/TextExtractorInterface.php
new file mode 100644 (file)
index 0000000..8ccf646
--- /dev/null
@@ -0,0 +1,44 @@
+<?php
+namespace TYPO3\CMS\Core\Resource\TextExtraction;
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+use TYPO3\CMS\Core\Resource\FileInterface;
+
+
+/**
+ * An interface for text extractors
+ *
+ * Implementations can be registered with the TextExtractorRegistry, which
+ * calls canExtractText() to find an extractor that is able to handle a file.
+ */
+interface TextExtractorInterface {
+
+       /**
+        * Checks if the given file can be read by this extractor
+        *
+        * @param FileInterface $file The file to check
+        * @return bool TRUE if this extractor can extract text from the file
+        */
+       public function canExtractText(FileInterface $file);
+
+       /**
+        * The actual text extraction.
+        *
+        * Should return a string of the file's content
+        *
+        * @param FileInterface $file The file to extract the text from
+        * @return string The text content of the file
+        */
+       public function extractText(FileInterface $file);
+
+}
diff --git a/typo3/sysext/core/Classes/Resource/TextExtraction/TextExtractorRegistry.php b/typo3/sysext/core/Classes/Resource/TextExtraction/TextExtractorRegistry.php
new file mode 100644 (file)
index 0000000..05a6485
--- /dev/null
@@ -0,0 +1,113 @@
+<?php
+namespace TYPO3\CMS\Core\Resource\TextExtraction;
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+use TYPO3\CMS\Core\Resource\FileInterface;
+use TYPO3\CMS\Core\SingletonInterface;
+use TYPO3\CMS\Core\Utility\GeneralUtility;
+
+
+/**
+ * Registry for text extractor classes.
+ *
+ * Extractor classes are registered by class name, instantiated on demand and
+ * asked in registration order whether they can handle a given file.
+ */
+class TextExtractorRegistry implements SingletonInterface {
+
+       /**
+        * Registered text extractor class names, in registration order
+        *
+        * @var string[]
+        */
+       protected $textExtractorClasses = array();
+
+       /**
+        * Instance cache for text extractor classes; entry N is the instance
+        * of entry N in $textExtractorClasses
+        *
+        * @var TextExtractorInterface[]
+        */
+       protected $instances = array();
+
+
+       /**
+        * Returns an instance of this class
+        *
+        * @return TextExtractorRegistry
+        */
+       static public function getInstance() {
+               return GeneralUtility::makeInstance(self::class);
+       }
+
+       /**
+        * Allows to register a text extractor class
+        *
+        * @param string $className Name of a class implementing TextExtractorInterface
+        * @throws \InvalidArgumentException if the class does not exist or does not implement TextExtractorInterface
+        */
+       public function registerTextExtractor($className) {
+               if (!class_exists($className)) {
+                       throw new \InvalidArgumentException('The class "' . $className . '" you are trying to register is not available', 1422906893);
+               }
+
+               if (!in_array(TextExtractorInterface::class, class_implements($className), TRUE)) {
+                       throw new \InvalidArgumentException($className . ' must implement interface ' . TextExtractorInterface::class, 1422771427);
+               }
+
+               $this->textExtractorClasses[] = $className;
+       }
+
+       /**
+        * Get all registered text extractor instances
+        *
+        * Instances are created once and cached. Classes registered after a
+        * previous call are instantiated on the next call.
+        *
+        * @return TextExtractorInterface[]
+        */
+       public function getTextExtractorInstances() {
+               // Only the classes that have no instance yet are instantiated. A plain
+               // "is the cache empty" check would ignore every extractor registered
+               // after the instances had been requested for the first time.
+               $pendingClassNames = array_slice($this->textExtractorClasses, count($this->instances));
+               foreach ($pendingClassNames as $className) {
+                       $this->instances[] = $this->createTextExtractorInstance($className);
+               }
+
+               return $this->instances;
+       }
+
+       /**
+        * Create an instance of a certain text extractor class
+        *
+        * @param string $className
+        * @return TextExtractorInterface
+        */
+       protected function createTextExtractorInstance($className) {
+               return GeneralUtility::makeInstance($className);
+       }
+
+       /**
+        * Checks whether any registered text extractor can deal with a given file
+        * and returns it.
+        *
+        * The extractors are asked in registration order; the first one whose
+        * canExtractText() returns a truthy value wins.
+        *
+        * @param FileInterface $file
+        * @return NULL|TextExtractorInterface NULL if no registered extractor accepts the file
+        */
+       public function getTextExtractor(FileInterface $file) {
+               foreach ($this->getTextExtractorInstances() as $textExtractor) {
+                       if ($textExtractor->canExtractText($file)) {
+                               return $textExtractor;
+                       }
+               }
+
+               return NULL;
+       }
+
+}
diff --git a/typo3/sysext/core/Documentation/Changelog/master/Feature-36743-FAL-TextExtractorRegistry.rst b/typo3/sysext/core/Documentation/Changelog/master/Feature-36743-FAL-TextExtractorRegistry.rst
new file mode 100644 (file)
index 0000000..ce00ccc
--- /dev/null
@@ -0,0 +1,62 @@
+============================================================
+Feature: #36743 - Registry for adding text extractor classes
+============================================================
+
+Description
+===========
+
+Text extraction from files is a complex task, so it would be unwise to
+implement it over and over again wherever it is needed. By providing a
+registry, text extraction services can be offered to other extensions.
+
+It is expected that there won't ever be a lot of implementations for text
+extractors.
+The core ships with an extractor for plain text files (MIME type text/plain).
+
+When asking the registry to provide a text extractor for a file it will "ask"
+the registered text extractors whether they can read the file. The first text
+extractor returning TRUE will be returned and can then be used to actually
+read/extract text from the file.
+
+Every registered text extractor class needs to implement the
+TextExtractorInterface with the following methods:
+
+- canExtractText() gets a file reference and returns TRUE if the text extractor
+  can extract text from that file. How this is determined is up to the text
+  extractor, e.g. by using the MIME type or file extension as indicators.
+- extractText() gets a file reference and is expected to return the file's text
+  content as string.
+
+It is possible to register your own text extractor classes in the
+ext_localconf.php of an extension.
+
+Examples
+--------
+
+Text extractor registration
+
+.. code-block:: php
+
+       $textExtractorRegistry = \TYPO3\CMS\Core\Resource\TextExtraction\TextExtractorRegistry::getInstance();
+       $textExtractorRegistry->registerTextExtractor(
+               \TYPO3\CMS\Core\Resource\TextExtraction\PlainTextExtractor::class
+       );
+
+
+Usage
+
+.. code-block:: php
+
+       $textExtractorRegistry = \TYPO3\CMS\Core\Resource\TextExtraction\TextExtractorRegistry::getInstance();
+       $extractor = $textExtractorRegistry->getTextExtractor($file);
+       if ($extractor !== NULL) {
+               $content = $extractor->extractText($file);
+       }
+
+
+Impact
+======
+
+The registry on its own doesn't do anything. It provides a facility in the core
+that allows extensions to provide text extraction services to be used by other
+extensions.
diff --git a/typo3/sysext/core/Tests/Unit/Resource/TextExtraction/PlainTextExtractorTest.php b/typo3/sysext/core/Tests/Unit/Resource/TextExtraction/PlainTextExtractorTest.php
new file mode 100644 (file)
index 0000000..adfb86d
--- /dev/null
@@ -0,0 +1,50 @@
+<?php
+namespace TYPO3\CMS\Core\Tests\Unit\Resource\TextExtraction;
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+use TYPO3\CMS\Core\Resource\File;
+use TYPO3\CMS\Core\Resource\TextExtraction\PlainTextExtractor;
+
+
+/**
+ * Class PlainTextExtractorTest
+ */
+class PlainTextExtractorTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
+
+       /**
+        * Builds a File mock whose getMimeType() returns the given MIME type
+        *
+        * @param string $mimeType
+        * @return \PHPUnit_Framework_MockObject_MockObject|File
+        */
+       protected function createFileMockWithMimeType($mimeType) {
+               $fileMock = $this->getMock(File::class, array(), array(), '', FALSE);
+               $fileMock->expects($this->any())
+                       ->method('getMimeType')
+                       ->will($this->returnValue($mimeType));
+
+               return $fileMock;
+       }
+
+       /**
+        * @test
+        */
+       public function canExtractTextReturnsTrueForPlainTextFiles() {
+               $subject = new PlainTextExtractor();
+               $plainTextFile = $this->createFileMockWithMimeType('text/plain');
+
+               $this->assertTrue($subject->canExtractText($plainTextFile));
+       }
+
+       /**
+        * @test
+        */
+       public function canExtractTextReturnsFalseForNonPlainTextFiles() {
+               $subject = new PlainTextExtractor();
+               $videoFile = $this->createFileMockWithMimeType('video/mp4');
+
+               $this->assertFalse($subject->canExtractText($videoFile));
+       }
+
+}
diff --git a/typo3/sysext/core/Tests/Unit/Resource/TextExtraction/TextExtractorRegistryTest.php b/typo3/sysext/core/Tests/Unit/Resource/TextExtraction/TextExtractorRegistryTest.php
new file mode 100644 (file)
index 0000000..9a74219
--- /dev/null
@@ -0,0 +1,79 @@
+<?php
+namespace TYPO3\CMS\Core\Tests\Unit\Resource\TextExtraction;
+
+/*
+ * This file is part of the TYPO3 CMS project.
+ *
+ * It is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License, either version 2
+ * of the License, or any later version.
+ *
+ * For the full copyright and license information, please read the
+ * LICENSE.txt file that was distributed with this source code.
+ *
+ * The TYPO3 project - inspiring people to share!
+ */
+
+use TYPO3\CMS\Core\Resource\TextExtraction\TextExtractorRegistry;
+use TYPO3\CMS\Core\Resource\TextExtraction\TextExtractorInterface;
+
+
+/**
+ * Test cases for TextExtractorRegistry
+ */
+class TextExtractorRegistryTest extends \TYPO3\CMS\Core\Tests\UnitTestCase {
+
+       /**
+        * Creates a TextExtractorRegistry mock in which only
+        * createTextExtractorInstance() is replaced.
+        *
+        * If a non-empty value map is passed, createTextExtractorInstance()
+        * is set up to answer according to that map.
+        *
+        * @param array $createsTextExtractorInstances
+        * @return \PHPUnit_Framework_MockObject_MockObject|TextExtractorRegistry
+        */
+       protected function getTextExtractorRegistry(array $createsTextExtractorInstances = array()) {
+               $registryMock = $this->getMockBuilder(TextExtractorRegistry::class)
+                       ->setMethods(array('createTextExtractorInstance'))
+                       ->getMock();
+
+               if (empty($createsTextExtractorInstances)) {
+                       return $registryMock;
+               }
+
+               $registryMock->expects($this->any())
+                       ->method('createTextExtractorInstance')
+                       ->will($this->returnValueMap($createsTextExtractorInstances));
+
+               return $registryMock;
+       }
+
+       /**
+        * @test
+        */
+       public function registeredTextExtractorClassCanBeRetrieved() {
+               $className = $this->getUniqueId('myTextExtractor');
+               $extractorMock = $this->getMock(TextExtractorInterface::class, array(), array(), $className);
+               $valueMap = array(
+                       array($className, $extractorMock),
+               );
+
+               $subject = $this->getTextExtractorRegistry($valueMap);
+               $subject->registerTextExtractor($className);
+
+               $this->assertContains($extractorMock, $subject->getTextExtractorInstances(), '', FALSE, FALSE);
+       }
+
+       /**
+        * @test
+        * @expectedException \InvalidArgumentException
+        * @expectedExceptionCode 1422906893
+        */
+       public function registerTextExtractorThrowsExceptionIfClassDoesNotExist() {
+               $subject = $this->getTextExtractorRegistry();
+               $unknownClassName = $this->getUniqueId();
+
+               $subject->registerTextExtractor($unknownClassName);
+       }
+
+       /**
+        * @test
+        * @expectedException \InvalidArgumentException
+        * @expectedExceptionCode 1422771427
+        */
+       public function registerTextExtractorThrowsExceptionIfClassDoesNotImplementRightInterface() {
+               $subject = $this->getTextExtractorRegistry();
+
+               // This test case class exists but is no text extractor
+               $subject->registerTextExtractor(self::class);
+       }
+
+}
index 783eebd..87eb9ae 100644 (file)
@@ -67,3 +67,6 @@ $GLOBALS['TYPO3_CONF_VARS']['FE']['eID_include']['dumpFile'] = 'EXT:core/Resourc
 $rendererRegistry = \TYPO3\CMS\Core\Resource\Rendering\RendererRegistry::getInstance();
 $rendererRegistry->registerRendererClass(\TYPO3\CMS\Core\Resource\Rendering\AudioTagRenderer::class);
 $rendererRegistry->registerRendererClass(\TYPO3\CMS\Core\Resource\Rendering\VideoTagRenderer::class);
+
+// Register the plain text extractor shipped with the core
+$textExtractorRegistry = \TYPO3\CMS\Core\Resource\TextExtraction\TextExtractorRegistry::getInstance();
+$textExtractorRegistry->registerTextExtractor(\TYPO3\CMS\Core\Resource\TextExtraction\PlainTextExtractor::class);