[FEATURE] Trigger metadata extraction after file upload
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Resource / Index / Indexer.php
1 <?php
2 namespace TYPO3\CMS\Core\Resource\Index;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Resource\ResourceStorage;
18 use TYPO3\CMS\Core\Resource\File;
19 use TYPO3\CMS\Core\Type\File\ImageInfo;
20 use TYPO3\CMS\Core\Utility\GeneralUtility;
21
/**
 * The FAL (File Abstraction Layer) indexer.
 *
 * Works on the single storage handed to the constructor: it creates and
 * updates file index records, detects changed, new and missing files and
 * runs the metadata extractors on indexed files.
 */
class Indexer {

	/**
	 * Files detected as changed or new, keyed by file identifier.
	 * The value is the existing index record (array) or NULL when the
	 * file has no index record yet.
	 *
	 * @var array
	 */
	protected $filesToUpdate = array();

	/**
	 * Uids of all index records that were matched to a file during detection
	 * and processing; used afterwards to find records without a file.
	 *
	 * @var int[]
	 */
	protected $identifiedFileUids = array();

	/**
	 * The storage this indexer operates on
	 *
	 * @var ResourceStorage
	 */
	protected $storage = NULL;

	/**
	 * Lazily loaded extractors for the driver type of the storage
	 * (NULL until getExtractionServices() is called the first time)
	 *
	 * @var ExtractorInterface[]
	 */
	protected $extractionServices = NULL;

	/**
	 * @param ResourceStorage $storage The storage whose files are indexed
	 */
	public function __construct(ResourceStorage $storage) {
		$this->storage = $storage;
	}

	/**
	 * Creates a new index record for the given file identifier and
	 * returns the file object built from that record.
	 *
	 * @param string $identifier Non-empty file identifier within the storage
	 * @return File
	 * @throws \InvalidArgumentException if the identifier is not a non-empty string
	 */
	public function createIndexEntry($identifier) {
		// is_string() already rejects NULL, so no separate isset() check is needed
		if (!is_string($identifier) || $identifier === '') {
			throw new \InvalidArgumentException('Invalid file identifier given. It must be of type string and not empty. "' . gettype($identifier) . '" given.', 1401732565);
		}
		$fileProperties = $this->gatherFileInformationArray($identifier);
		$record = $this->getFileIndexRepository()->addRaw($fileProperties);
		$fileObject = $this->getResourceFactory()->getFileObject($record['uid'], $record);
		$this->extractRequiredMetaData($fileObject);
		return $fileObject;
	}

	/**
	 * Refreshes the index record of an already indexed file with the
	 * information currently reported by the storage.
	 *
	 * @param File $fileObject
	 * @return void
	 */
	public function updateIndexEntry(File $fileObject) {
		$updatedInformation = $this->gatherFileInformationArray($fileObject->getIdentifier());
		$fileObject->updateProperties($updatedInformation);
		$this->getFileIndexRepository()->update($fileObject);
		$this->extractRequiredMetaData($fileObject);
	}

	/**
	 * Synchronizes the index with the storage: detects changed and new files,
	 * processes them and finally marks index records without a file as missing.
	 *
	 * @return void
	 */
	public function processChangesInStorages() {
		// Get all file identifiers below the root level folder of the storage.
		// NOTE(review): the two TRUE flags presumably enable filters and recursion - confirm against ResourceStorage
		$availableFiles = $this->storage->getFileIdentifiersInFolder($this->storage->getRootLevelFolder()->getIdentifier(), TRUE, TRUE);
		$this->detectChangedFilesInStorage($availableFiles);
		$this->processChangedAndNewFiles();

		// Must run last: it relies on the uids collected by the two steps above
		$this->detectMissingFiles();
	}

	/**
	 * Runs the metadata extraction for the index records of this storage
	 * that the file index repository reports as outstanding.
	 *
	 * @param int $maximumFileCount Passed on to the repository query; defaults to -1 (presumably "no limit" - confirm against FileIndexRepository)
	 * @return void
	 */
	public function runMetaDataExtraction($maximumFileCount = -1) {
		$fileIndexRecords = $this->getFileIndexRepository()->findInStorageWithIndexOutstanding($this->storage, $maximumFileCount);
		foreach ($fileIndexRecords as $indexRecord) {
			$fileObject = $this->getResourceFactory()->getFileObject($indexRecord['uid'], $indexRecord);
			$this->extractMetaData($fileObject);
		}
	}

	/**
	 * Extract metadata for given fileObject
	 *
	 * The existing metadata is used as base (key 0). Every extractor that can
	 * process the file stores its result under its priority. The results are
	 * then merged in ascending key order, so values from a higher priority
	 * overwrite values from a lower one. Finally the merged metadata is
	 * persisted and the indexing time of the file is updated.
	 *
	 * @param File $fileObject
	 * @return void
	 */
	public function extractMetaData(File $fileObject) {
		$newMetaData = array(
			0 => $fileObject->_getMetaData()
		);
		foreach ($this->getExtractionServices() as $service) {
			if ($service->canProcess($fileObject)) {
				// NOTE(review): results are keyed by priority, so two extractors with the same
				// priority overwrite each other and an extractor with priority 0 replaces the
				// existing metadata at key 0.
				$newMetaData[$service->getPriority()] = $service->extractMetaData($fileObject, $newMetaData);
			}
		}
		ksort($newMetaData);
		$metaData = array();
		foreach ($newMetaData as $data) {
			$metaData = array_merge($metaData, $data);
		}
		$fileObject->_updateMetaDataProperties($metaData);
		$this->getMetaDataRepository()->update($fileObject->getUid(), $metaData);
		$this->getFileIndexRepository()->updateIndexingTime($fileObject->getUid());
	}

	/**
	 * Get available extraction services
	 *
	 * The extractors are fetched from the registry once, for the driver type
	 * of the storage, and kept for subsequent calls.
	 *
	 * @return ExtractorInterface[]
	 */
	protected function getExtractionServices() {
		if ($this->extractionServices === NULL) {
			$this->extractionServices = $this->getExtractorRegistry()->getExtractorsWithDriverSupport($this->storage->getDriverType());
		}
		return $this->extractionServices;
	}

	/**
	 * Since by now all files in the file system have been looked at, it is safe to
	 * assume that files which are indexed but were not touched in this run are missing.
	 *
	 * @return void
	 */
	protected function detectMissingFiles() {
		$indexedNotExistentFiles = $this->getFileIndexRepository()->findInStorageAndNotInUidList(
			$this->storage,
			$this->identifiedFileUids
		);

		foreach ($indexedNotExistentFiles as $record) {
			// Ask the storage again before flagging the record as missing
			if (!$this->storage->hasFile($record['identifier'])) {
				$this->getFileIndexRepository()->markFileAsMissing($record['uid']);
			}
		}
	}

	/**
	 * Adds updated files to the processing queue
	 *
	 * Files with an index record are queued when their modification time differs
	 * from the indexed one or when the record is flagged as missing. Files without
	 * an index record are queued with NULL as value.
	 *
	 * @param array $fileIdentifierArray File identifiers found in the storage
	 * @return void
	 */
	protected function detectChangedFilesInStorage(array $fileIdentifierArray) {
		foreach ($fileIdentifierArray as $fileIdentifier) {
			// skip processed files
			if ($this->storage->isWithinProcessingFolder($fileIdentifier)) {
				continue;
			}
			// Get the modification time for file-identifier from the storage
			$modificationTime = $this->storage->getFileInfoByIdentifier($fileIdentifier, array('mtime'));
			// Look up the index record to compare its modification time with the one in the storage
			$indexRecord = $this->getFileIndexRepository()->findOneByStorageUidAndIdentifier($this->storage->getUid(), $fileIdentifier);

			if ($indexRecord === FALSE) {
				// No index record yet: queue as new file
				$this->filesToUpdate[$fileIdentifier] = NULL;
				continue;
			}

			$this->identifiedFileUids[] = $indexRecord['uid'];

			// Compare as integers: the index record may hold the timestamp as numeric string
			// while the storage reports an integer. A strict comparison of the raw values
			// would then flag every file as changed on every run.
			$isModified = (int)$indexRecord['modification_date'] !== (int)$modificationTime['mtime'];
			if ($isModified || $indexRecord['missing']) {
				$this->filesToUpdate[$fileIdentifier] = $indexRecord;
			}
		}
	}

	/**
	 * Processes the files which have been detected as "changed or new"
	 * in the storage
	 *
	 * A new file whose content hash matches an index record with a vanished
	 * file is treated as moved/renamed and re-uses that record. Any other new
	 * file gets a fresh index record. Changed files are updated.
	 *
	 * @return void
	 */
	protected function processChangedAndNewFiles() {
		foreach ($this->filesToUpdate as $identifier => $data) {
			if ($data !== NULL) {
				// update existing file
				$fileObject = $this->getResourceFactory()->getFileObject($data['uid'], $data);
				$this->updateIndexEntry($fileObject);
				continue;
			}

			// search for files with same content hash in indexed storage
			$fileHash = $this->storage->hashFileByIdentifier($identifier, 'sha1');
			$files = $this->getFileIndexRepository()->findByContentHash($fileHash);
			$fileObject = NULL;
			if (!empty($files)) {
				foreach ($files as $fileIndexEntry) {
					// check if file is missing then we assume it's moved/renamed
					if (!$this->storage->hasFile($fileIndexEntry['identifier'])) {
						$fileObject = $this->getResourceFactory()->getFileObject($fileIndexEntry['uid'], $fileIndexEntry);
						$fileObject->updateProperties(array(
							'identifier' => $identifier
						));
						$this->updateIndexEntry($fileObject);
						$this->identifiedFileUids[] = $fileObject->getUid();
						break;
					}
				}
			}
			// create new index when no missing file with same content hash is found
			if ($fileObject === NULL) {
				$fileObject = $this->createIndexEntry($identifier);
				$this->identifiedFileUids[] = $fileObject->getUid();
			}
		}
	}

	/**
	 * Since the core desperately needs image sizes in metadata table put them there
	 * This should be called after every "content" update and "record" creation
	 *
	 * Only images in storages with driver type "Local" are handled here.
	 *
	 * @param File $fileObject
	 * @return void
	 */
	protected function extractRequiredMetaData(File $fileObject) {
		// prevent doing this for remote storages, remote storages must provide the data with extractors
		// NOTE(review): the loose comparison of the file type is kept on purpose, the type
		// presumably may arrive as numeric string from the index record - confirm
		if ($fileObject->getType() == File::FILETYPE_IMAGE && $this->storage->getDriverType() === 'Local') {
			$rawFileLocation = $fileObject->getForLocalProcessing(FALSE);
			$imageInfo = GeneralUtility::makeInstance(ImageInfo::class, $rawFileLocation);
			$metaData = array(
				'width' => $imageInfo->getWidth(),
				'height' => $imageInfo->getHeight(),
			);
			$this->getMetaDataRepository()->update($fileObject->getUid(), $metaData);
			$fileObject->_updateMetaDataProperties($metaData);
		}
	}

	/****************************
	 *
	 *         UTILITY
	 *
	 ****************************/

	/**
	 * Collects the information to be cached in sys_file
	 *
	 * Takes the file info reported by the storage, maps it to the property
	 * names of the file object and adds type, sha1 hash and extension.
	 * The "missing" flag is always reset to 0.
	 *
	 * @param string $identifier
	 * @return array
	 */
	protected function gatherFileInformationArray($identifier) {
		$fileInfo = $this->storage->getFileInfoByIdentifier($identifier);
		$fileInfo = $this->transformFromDriverFileInfoArrayToFileObjectFormat($fileInfo);
		$fileInfo['type'] = $this->getFileType($fileInfo['mime_type']);
		$fileInfo['sha1'] = $this->storage->hashFileByIdentifier($identifier, 'sha1');
		$fileInfo['extension'] = \TYPO3\CMS\Core\Utility\PathUtility::pathinfo($fileInfo['name'], PATHINFO_EXTENSION);
		$fileInfo['missing'] = 0;

		return $fileInfo;
	}

	/**
	 * Maps the mimetype to a sys_file table type
	 *
	 * Only the part of the mime type before the first slash is evaluated
	 * (case-insensitive). Anything not listed maps to File::FILETYPE_UNKNOWN.
	 *
	 * @param string $mimeType
	 * @return string
	 */
	protected function getFileType($mimeType) {
		list($fileType) = explode('/', $mimeType);
		switch (strtolower($fileType)) {
			case 'text':
				$type = File::FILETYPE_TEXT;
				break;
			case 'image':
				$type = File::FILETYPE_IMAGE;
				break;
			case 'audio':
				$type = File::FILETYPE_AUDIO;
				break;
			case 'video':
				$type = File::FILETYPE_VIDEO;
				break;
			case 'application':
			case 'software':
				$type = File::FILETYPE_APPLICATION;
				break;
			default:
				$type = File::FILETYPE_UNKNOWN;
		}
		return $type;
	}

	/**
	 * However it happened, the properties of a file object which
	 * are persisted to the database are named different than the
	 * properties the driver returns in getFileInfo.
	 * Therefore a mapping must happen.
	 *
	 * Keys mapped to NULL are dropped, keys without a mapping are kept as they are.
	 *
	 * @param array $fileInfo File info as returned by the driver
	 *
	 * @return array File info keyed by file object property names
	 */
	protected function transformFromDriverFileInfoArrayToFileObjectFormat(array $fileInfo) {
		$mappingInfo = array(
			// 'driverKey' => 'fileProperty' Key is from the driver, value is for the property in the file
			'size' => 'size',
			'atime' => NULL,
			'mtime' => 'modification_date',
			'ctime' => 'creation_date',
			'mimetype' => 'mime_type'
		);
		$mappedFileInfo = array();
		foreach ($fileInfo as $key => $value) {
			if (!array_key_exists($key, $mappingInfo)) {
				// unknown driver key: pass through unchanged
				$mappedFileInfo[$key] = $value;
			} elseif ($mappingInfo[$key] !== NULL) {
				$mappedFileInfo[$mappingInfo[$key]] = $value;
			}
		}
		return $mappedFileInfo;
	}

	/**
	 * Returns an instance of the FileIndexRepository
	 *
	 * @return FileIndexRepository
	 */
	protected function getFileIndexRepository() {
		return FileIndexRepository::getInstance();
	}

	/**
	 * Returns an instance of the MetaDataRepository
	 *
	 * @return MetaDataRepository
	 */
	protected function getMetaDataRepository() {
		return MetaDataRepository::getInstance();
	}

	/**
	 * Returns the ResourceFactory
	 *
	 * @return \TYPO3\CMS\Core\Resource\ResourceFactory
	 */
	protected function getResourceFactory() {
		return \TYPO3\CMS\Core\Resource\ResourceFactory::getInstance();
	}

	/**
	 * Returns an instance of the ExtractorRegistry
	 *
	 * @return ExtractorRegistry
	 */
	protected function getExtractorRegistry() {
		return ExtractorRegistry::getInstance();
	}

}