[BUGFIX] FileStorageExtractionTask breaks if file not found
[Packages/TYPO3.CMS.git] / typo3 / sysext / core / Classes / Resource / Index / Indexer.php
1 <?php
2 namespace TYPO3\CMS\Core\Resource\Index;
3
4 /*
5 * This file is part of the TYPO3 CMS project.
6 *
7 * It is free software; you can redistribute it and/or modify it under
8 * the terms of the GNU General Public License, either version 2
9 * of the License, or any later version.
10 *
11 * For the full copyright and license information, please read the
12 * LICENSE.txt file that was distributed with this source code.
13 *
14 * The TYPO3 project - inspiring people to share!
15 */
16
17 use TYPO3\CMS\Core\Resource\File;
18 use TYPO3\CMS\Core\Resource\ResourceFactory;
19 use TYPO3\CMS\Core\Resource\ResourceStorage;
20 use TYPO3\CMS\Core\Type\File\ImageInfo;
21 use TYPO3\CMS\Core\Utility\GeneralUtility;
22 use TYPO3\CMS\Core\Utility\PathUtility;
23
24 /**
25 * The New FAL Indexer
26 */
27 class Indexer
28 {
/**
 * Files queued for (re-)indexing, keyed by file identifier.
 * The value is the existing index record, or null when the file
 * has no index record yet.
 *
 * @var array
 */
protected $filesToUpdate = [];

/**
 * Uids of the index records that were matched or created during this run.
 * Records of the storage that are not in this list are candidates
 * for being marked as missing.
 *
 * @var int[]
 */
protected $identifiedFileUids = [];

/**
 * The storage this indexer works on; set by the constructor.
 *
 * @var ResourceStorage
 */
protected $storage;

/**
 * Extractors for the storage's driver type; stays null until they
 * are requested for the first time.
 *
 * @var ExtractorInterface[]
 */
protected $extractionServices;
48
/**
 * Binds the indexer to the storage whose files it indexes.
 *
 * @param ResourceStorage $storage Storage all later operations work on
 */
public function __construct(ResourceStorage $storage)
{
    // Every other method reads the storage from this property.
    $this->storage = $storage;
}
56
/**
 * Creates an index record for a file and returns the matching file object.
 *
 * The file information is gathered from the storage, added to the index
 * as a raw record and finally enriched with the required metadata.
 *
 * @param string $identifier Identifier of the file within the storage
 * @return File The file object built from the new index record
 * @throws \InvalidArgumentException If the identifier is not a non-empty string
 */
public function createIndexEntry($identifier)
{
    $isUsableIdentifier = isset($identifier) && is_string($identifier) && $identifier !== '';
    if (!$isUsableIdentifier) {
        throw new \InvalidArgumentException(
            'Invalid file identifier given. It must be of type string and not empty. "' . gettype($identifier) . '" given.',
            1401732565
        );
    }

    $gatheredProperties = $this->gatherFileInformationArray($identifier);
    $indexRecord = $this->getFileIndexRepository()->addRaw($gatheredProperties);

    // The record returned by the repository carries the uid of the new entry.
    $file = $this->getResourceFactory()->getFileObject($indexRecord['uid'], $indexRecord);
    $this->extractRequiredMetaData($file);

    return $file;
}
75
/**
 * Refreshes the index record of an already indexed file.
 *
 * The file information is gathered again from the storage, applied to the
 * file object, persisted, and the required metadata is extracted anew.
 *
 * @param File $fileObject File whose index record is brought up to date
 */
public function updateIndexEntry(File $fileObject)
{
    $identifier = $fileObject->getIdentifier();
    $freshProperties = $this->gatherFileInformationArray($identifier);

    $fileObject->updateProperties($freshProperties);
    $this->getFileIndexRepository()->update($fileObject);

    $this->extractRequiredMetaData($fileObject);
}
88
/**
 * Synchronises the index with the current content of the storage.
 *
 * Collects the file identifiers below the root level folder, queues and
 * processes changed and new files, and afterwards looks for indexed
 * files that were not encountered.
 */
public function processChangesInStorages()
{
    // NOTE(review): the two "true" flags presumably enable filters and recursion - confirm against ResourceStorage
    $rootFolder = $this->storage->getRootLevelFolder(false);
    $identifiersInStorage = $this->storage->getFileIdentifiersInFolder($rootFolder->getIdentifier(), true, true);

    $this->detectChangedFilesInStorage($identifiersInStorage);
    $this->processChangedAndNewFiles();

    // Must run last: it relies on the uids collected by the two steps above.
    $this->detectMissingFiles();
}
100
/**
 * Runs the metadata extraction for files of this storage whose indexing
 * is reported as outstanding by the file index repository.
 *
 * Files that no longer exist are marked as missing instead of being
 * handed to the extractors.
 *
 * @param int $maximumFileCount Passed on to the repository query; the default
 *                              of -1 presumably means "no limit" - confirm
 */
public function runMetaDataExtraction($maximumFileCount = -1)
{
    $outstandingRecords = $this->getFileIndexRepository()->findInStorageWithIndexOutstanding($this->storage, $maximumFileCount);

    foreach ($outstandingRecords as $record) {
        $file = $this->getResourceFactory()->getFileObject($record['uid'], $record);

        // Extraction needs the file itself: flag vanished files and move on.
        if (!$file->exists()) {
            $this->getFileIndexRepository()->markFileAsMissing($record['uid']);
            continue;
        }

        $this->extractMetaData($file);
    }
}
119
/**
 * Collects metadata for a file from all applicable extractors and stores it.
 *
 * The file's current metadata is used as the base (slot 0). Each extractor
 * result is stored under the extractor's priority, the slots are sorted by
 * key and merged in that order, so values from later slots overwrite
 * earlier ones.
 *
 * @param File $fileObject File to extract and persist metadata for
 */
public function extractMetaData(File $fileObject)
{
    $metaDataByPriority = [
        0 => $fileObject->_getMetaData()
    ];

    foreach ($this->getExtractionServices() as $extractor) {
        if (!$this->isFileTypeSupportedByExtractor($fileObject, $extractor)) {
            continue;
        }
        if (!$extractor->canProcess($fileObject)) {
            continue;
        }
        // Each extractor also receives everything that was collected so far.
        $metaDataByPriority[$extractor->getPriority()] = $extractor->extractMetaData($fileObject, $metaDataByPriority);
    }

    // Ascending key order defines the precedence used while merging.
    ksort($metaDataByPriority);

    $mergedMetaData = array_reduce(
        $metaDataByPriority,
        function ($merged, $slot) {
            return array_merge($merged, $slot);
        },
        []
    );

    $fileUid = $fileObject->getUid();
    $fileObject->_updateMetaDataProperties($mergedMetaData);
    $this->getMetaDataRepository()->update($fileUid, $mergedMetaData);
    $this->getFileIndexRepository()->updateIndexingTime($fileObject->getUid());
}
150
/**
 * Returns the extractors that support the driver type of the storage.
 *
 * The list is fetched from the extractor registry on first use and
 * reused for all later calls.
 *
 * @return ExtractorInterface[]
 */
protected function getExtractionServices()
{
    if ($this->extractionServices !== null) {
        return $this->extractionServices;
    }

    $registry = $this->getExtractorRegistry();
    $this->extractionServices = $registry->getExtractorsWithDriverSupport($this->storage->getDriverType());

    return $this->extractionServices;
}
163
/**
 * Marks indexed files as missing when they are gone from the storage.
 *
 * By the time this runs, all files in the storage have been looked at, so
 * it is safe to assume that indexed files which were not touched during
 * this run may be missing. Each candidate is still verified against the
 * storage before it is flagged.
 */
protected function detectMissingFiles()
{
    $untouchedRecords = $this->getFileIndexRepository()->findInStorageAndNotInUidList(
        $this->storage,
        $this->identifiedFileUids
    );

    foreach ($untouchedRecords as $candidate) {
        if ($this->storage->hasFile($candidate['identifier'])) {
            // The file is still there, it just was not part of this run.
            continue;
        }
        $this->getFileIndexRepository()->markFileAsMissing($candidate['uid']);
    }
}
181
/**
 * Tells whether the extractor's file type restrictions allow the given file.
 *
 * An extractor without restrictions accepts every file type; otherwise the
 * type of the file has to be listed in the restrictions.
 *
 * @param File $file File whose type is checked
 * @param ExtractorInterface $extractor Extractor providing the restrictions
 * @return bool TRUE if the extractor may be used for the file
 */
protected function isFileTypeSupportedByExtractor(File $file, ExtractorInterface $extractor)
{
    $allowedFileTypes = $extractor->getFileTypeRestrictions();
    if (empty($allowedFileTypes)) {
        return true;
    }

    // Loose comparison on purpose: it matches the original lookup semantics.
    return in_array($file->getType(), $allowedFileTypes);
}
198
/**
 * Adds new and changed files to the processing queue.
 *
 * For every given identifier outside the processing folder the index record
 * is looked up. Files without a record are queued with null. Files with a
 * record are remembered as identified and queued together with their record
 * when the modification time differs or the record is flagged as missing.
 *
 * @param array $fileIdentifierArray Identifiers of the files found in the storage
 */
protected function detectChangedFilesInStorage(array $fileIdentifierArray)
{
    foreach ($fileIdentifierArray as $fileIdentifier) {
        // Skip processed files
        if ($this->storage->isWithinProcessingFolder($fileIdentifier)) {
            continue;
        }

        // Get the modification time for the file identifier from the storage
        $fileInfo = $this->storage->getFileInfoByIdentifier($fileIdentifier, ['mtime']);
        $indexRecord = $this->getFileIndexRepository()->findOneByStorageUidAndIdentifier($this->storage->getUid(), $fileIdentifier);

        if ($indexRecord === false) {
            // Not indexed yet: null tells the processing step to create a record.
            $this->filesToUpdate[$fileIdentifier] = null;
            continue;
        }

        $this->identifiedFileUids[] = $indexRecord['uid'];

        // Normalise both timestamps to int before the strict comparison.
        // Nothing in this class guarantees that the driver returns an int;
        // a numeric string would otherwise make every file look changed.
        $indexedModificationTime = (int)$indexRecord['modification_date'];
        $currentModificationTime = (int)$fileInfo['mtime'];

        if ($indexedModificationTime !== $currentModificationTime || $indexRecord['missing']) {
            $this->filesToUpdate[$fileIdentifier] = $indexRecord;
        }
    }
}
227
/**
 * Processes the files that have been detected as "changed or new"
 * in the storage.
 *
 * Queue entries with a null value have no index record yet. For those, an
 * indexed file with the same content hash whose identifier no longer exists
 * in the storage is treated as moved/renamed and gets the new identifier;
 * if there is none, a new index record is created. Entries carrying an
 * index record are simply updated.
 *
 * Every entry is removed from the queue once it has been handled, so
 * running the indexer again on the same instance does not process it a
 * second time (which would create duplicate index records for new files).
 */
protected function processChangedAndNewFiles()
{
    // foreach iterates over a copy of the array, so removing entries
    // from the property while looping is safe.
    foreach ($this->filesToUpdate as $identifier => $data) {
        // Strict check: only a real null marks a file without index record.
        if ($data === null) {
            // Search for files with the same content hash in the index
            $fileHash = $this->storage->hashFileByIdentifier($identifier, 'sha1');
            $files = $this->getFileIndexRepository()->findByContentHash($fileHash);
            $fileObject = null;
            if (!empty($files)) {
                foreach ($files as $fileIndexEntry) {
                    // If the indexed file is gone we assume it was moved/renamed
                    if (!$this->storage->hasFile($fileIndexEntry['identifier'])) {
                        $fileObject = $this->getResourceFactory()->getFileObject($fileIndexEntry['uid'], $fileIndexEntry);
                        $fileObject->updateProperties([
                            'identifier' => $identifier
                        ]);
                        $this->updateIndexEntry($fileObject);
                        $this->identifiedFileUids[] = $fileObject->getUid();
                        break;
                    }
                }
            }
            // Create a new index record when no missing file with the same content hash was found
            if ($fileObject === null) {
                $fileObject = $this->createIndexEntry($identifier);
                $this->identifiedFileUids[] = $fileObject->getUid();
            }
        } else {
            // Update existing file
            $fileObject = $this->getResourceFactory()->getFileObject($data['uid'], $data);
            $this->updateIndexEntry($fileObject);
        }

        // Handled: drop the entry so it is not picked up again by a later run.
        unset($this->filesToUpdate[$identifier]);
    }
}
266
/**
 * Stores the image dimensions in the metadata of the given file.
 *
 * The core needs image sizes in the metadata table, so this should be
 * called after every "content" update and "record" creation. Only images
 * in storages with the "Local" driver are handled here; remote storages
 * must provide the data with extractors.
 *
 * @param File $fileObject File to determine width and height for
 */
protected function extractRequiredMetaData(File $fileObject)
{
    if ($fileObject->getType() != File::FILETYPE_IMAGE || $this->storage->getDriverType() !== 'Local') {
        return;
    }

    // NOTE(review): "false" presumably requests the file without write access - confirm against File
    $localPath = $fileObject->getForLocalProcessing(false);
    $imageInfo = GeneralUtility::makeInstance(ImageInfo::class, $localPath);

    $dimensions = [
        'width' => $imageInfo->getWidth(),
        'height' => $imageInfo->getHeight(),
    ];

    // Persist first, then mirror the values on the in-memory file object.
    $this->getMetaDataRepository()->update($fileObject->getUid(), $dimensions);
    $fileObject->_updateMetaDataProperties($dimensions);
}
288
289 /****************************
290 *
291 * UTILITY
292 *
293 ****************************/
294
/**
 * Collects the information about a file that is cached in sys_file.
 *
 * The driver's file information is mapped to the property names of file
 * objects and complemented with the file type, the sha1 content hash and
 * the file extension. The "missing" flag is always reset to 0.
 *
 * @param string $identifier Identifier of the file within the storage
 * @return array The properties for the index record
 */
protected function gatherFileInformationArray($identifier)
{
    $driverFileInfo = $this->storage->getFileInfoByIdentifier($identifier);
    $properties = $this->transformFromDriverFileInfoArrayToFileObjectFormat($driverFileInfo);

    $properties['type'] = $this->getFileType($properties['mime_type']);
    $properties['sha1'] = $this->storage->hashFileByIdentifier($identifier, 'sha1');
    $properties['extension'] = PathUtility::pathinfo($properties['name'], PATHINFO_EXTENSION);
    // The file information was just read from the storage, so the file is not missing.
    $properties['missing'] = 0;

    return $properties;
}
312
/**
 * Maps a mime type to a sys_file table type.
 *
 * Only the part before the first slash is looked at, case-insensitively.
 * Anything that is not text, image, audio, video, application or software
 * maps to File::FILETYPE_UNKNOWN.
 *
 * @param string $mimeType Mime type such as "image/png"
 * @return string One of the File::FILETYPE_* constants
 */
protected function getFileType($mimeType)
{
    $typeByMimeGroup = [
        'text' => File::FILETYPE_TEXT,
        'image' => File::FILETYPE_IMAGE,
        'audio' => File::FILETYPE_AUDIO,
        'video' => File::FILETYPE_VIDEO,
        'application' => File::FILETYPE_APPLICATION,
        'software' => File::FILETYPE_APPLICATION,
    ];

    $mimeParts = explode('/', $mimeType);
    $mimeGroup = strtolower($mimeParts[0]);

    if (array_key_exists($mimeGroup, $typeByMimeGroup)) {
        return $typeByMimeGroup[$mimeGroup];
    }
    return File::FILETYPE_UNKNOWN;
}
344
/**
 * Translates the keys of a driver file info array into the property
 * names used by file objects.
 *
 * The properties persisted to the database are named differently from
 * what the driver returns, so a mapping is required: "mtime", "ctime" and
 * "mimetype" are renamed, "atime" is dropped, and every key without a
 * mapping is passed through unchanged.
 *
 * @param array $fileInfo File information as returned by the driver
 *
 * @return array The same values keyed by file property names
 */
protected function transformFromDriverFileInfoArrayToFileObjectFormat(array $fileInfo)
{
    // Driver key => file property name; null means "do not take over".
    $propertyNameByDriverKey = [
        'size' => 'size',
        'atime' => null,
        'mtime' => 'modification_date',
        'ctime' => 'creation_date',
        'mimetype' => 'mime_type'
    ];

    $fileProperties = [];
    foreach ($fileInfo as $driverKey => $value) {
        if (!array_key_exists($driverKey, $propertyNameByDriverKey)) {
            // Unknown keys keep their name.
            $fileProperties[$driverKey] = $value;
            continue;
        }

        $propertyName = $propertyNameByDriverKey[$driverKey];
        if ($propertyName === null) {
            continue;
        }
        $fileProperties[$propertyName] = $value;
    }

    return $fileProperties;
}
377
/**
 * Provides the repository used to read and write file index records.
 *
 * @return FileIndexRepository
 */
protected function getFileIndexRepository()
{
    $fileIndexRepository = FileIndexRepository::getInstance();

    return $fileIndexRepository;
}
387
/**
 * Provides the repository used to persist file metadata.
 *
 * @return MetaDataRepository
 */
protected function getMetaDataRepository()
{
    $metaDataRepository = MetaDataRepository::getInstance();

    return $metaDataRepository;
}
397
/**
 * Provides the factory used to build file objects from index records.
 *
 * @return ResourceFactory
 */
protected function getResourceFactory()
{
    $resourceFactory = ResourceFactory::getInstance();

    return $resourceFactory;
}
407
/**
 * Provides the registry that knows the available metadata extractors.
 *
 * @return ExtractorRegistry
 */
protected function getExtractorRegistry()
{
    $extractorRegistry = ExtractorRegistry::getInstance();

    return $extractorRegistry;
}
417 }