From b39a46f59b9c8926a87ac5ebb008f48946858ef7 Mon Sep 17 00:00:00 2001 From: Gerben Date: Mon, 3 Sep 2018 14:09:59 +0200 Subject: [PATCH] Factor out snapshot search from timegate logic. --- lib/Controller/TimeGateController.php | 108 ++++---------------------- lib/Controller/findMementos.php | 93 ++++++++++++++++++++++ 2 files changed, 110 insertions(+), 91 deletions(-) create mode 100644 lib/Controller/findMementos.php diff --git a/lib/Controller/TimeGateController.php b/lib/Controller/TimeGateController.php index 394db7c..49842d7 100644 --- a/lib/Controller/TimeGateController.php +++ b/lib/Controller/TimeGateController.php @@ -1,7 +1,8 @@ userFolder->searchByMime('text/html'); - - // Filter them for pages that have a <link rel="original"> referring to the given URL. - $matchingFiles = array(); - foreach ($files as $file) { - $content = $file->getContent(); - try { - $DOM = new DOMDocument; - $DOM->loadHTML($content); - $head = $DOM->documentElement->getElementsByTagName('head')[0]; - $originals = getOriginals($head); - foreach ($originals as $original) { - if (normaliseUrl($original) === normaliseUrl($url)) { - // Found a match! - $datetime = getDatetime($head); - $matchingFiles[] = [ - 'file' => $file, - 'original' => $original, - 'datetime' => $datetime - ]; - } - } - } catch (Exception $e) { - continue; - } - } + $matchingMementos = findMementos($this->userFolder, $url); - // Choose one of the matched files, if any. - if (count($matchingFiles) === 0) { + // Choose one of the matched mementos, if any. + if (count($matchingMementos) === 0) { // No matches. :( $message = "

No snapshots found for requested URL. :(

"; return new DataDisplayResponse($message, 404); - } else if (count($matchingFiles) === 1) { + } else if (count($matchingMementos) === 1) { // One match; no need to choose. - $chosenFile = $matchingFiles[0]; + $chosenMemento = $matchingMementos[0]; } else { // Multiple matches: choose based on requested date. $acceptDatetimeHeader = $this->request->getHeader('Accept-Datetime'); @@ -80,73 +57,22 @@ class TimeGateController extends Controller { $requestedDatetime = time(); } // Pick the one closest to the requested date (either before or after it). - $chosenFile = minBy($matchingFiles, function ($matchingFile) use ($requestedDatetime) { - return abs($matchingFile['datetime'] - $requestedDatetime); - }); + $chosenMemento = minBy($matchingMementos, + function ($matchingMemento) use ($requestedDatetime) { + return abs($matchingMemento['datetime'] - $requestedDatetime); + } + ); } - // Send a 302 Found redirect pointing to the chosen file. - $absoluteFilePath = $chosenFile['file']->getPath(); - $relativeFilePath = $this->userFolder->getRelativePath($absoluteFilePath); - $fileUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency - $originalUrl = $chosenFile['original']; - $response = new RedirectResponse($fileUrl); + // Send a 302 Found redirect pointing to the chosen memento. + $response = new RedirectResponse($chosenMemento['mementoUrl']); $response->setStatus(302); $response->addHeader('Vary', 'accept-datetime'); - $response->addHeader('Link', "<$originalUrl>; rel=\"original\""); + $response->addHeader('Link', "<{$chosenMemento['originalUrl']}>; rel=\"original\""); return $response; } } -function joinPaths($piece1, $piece2) { - $left = rtrim($piece1, '/'); - $right = ltrim($piece2, '/'); - return "$left/$right"; -} - -// Reads hrefs from any <link> with relation type "original".
-// (note the plural: we also accept pages that claim to correspond to multiple original URLs) -function getOriginals($head) { - $originals = []; - $links = $head->getElementsByTagName('link'); - foreach ($links as $link) { - $rels = explode(' ', $link->getAttribute('rel')); - if (in_array('original', $rels)) { - $href = $link->getAttribute('href'); - $href = filter_var($href, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); - if ($href) { - $originals[] = $href; - } - } - } - return $originals; -} - -// Read the content of the first <meta http-equiv="Memento-Datetime">, if any. -function getDatetime($head) { - $metas = $head->getElementsByTagName('meta'); - foreach($metas as $meta) { - // Let's match case-insensitively, I guess? - if (strtolower($meta->getAttribute('http-equiv')) === 'memento-datetime') { - $datetime = $meta->getAttribute('content'); - $datetime = DateTime::createFromFormat(DateTime::RFC1123, $datetime)->getTimestamp(); - return $datetime; // Return directly at the first match - } - } - return null; -} - -function normaliseUrl($url) { - // Ignore trailing slashes. Because everybody does. - $url = rtrim($url, '/'); - - // Replace multiple slashes with a single one. Because Nextcloud will have already done this to - // the queried url (e.g. 'http://abc' arrives to us as 'http:/abc') - $url = preg_replace('%/{2,}%', '/', $url); - - return $url; -} - function minBy($array, $iteratee) { // is there any simpler way for this in php? $values = array_map($iteratee, $array); diff --git a/lib/Controller/findMementos.php b/lib/Controller/findMementos.php new file mode 100644 index 0000000..3a104dd --- /dev/null +++ b/lib/Controller/findMementos.php @@ -0,0 +1,93 @@ +searchByMime('text/html'); + + // Filter them for pages that have a <link rel="original"> referring to the given URL.
+ $matchingMementos = array(); + foreach ($files as $file) { + $content = $file->getContent(); + try { + $DOM = new DOMDocument; + $DOM->loadHTML($content); + $headElement = $DOM->documentElement->getElementsByTagName('head')[0]; + $originalUrls = getOriginalUrls($headElement); + foreach ($originalUrls as $originalUrl) { + if (normaliseUrl($originalUrl) === normaliseUrl($url)) { + // Found a match! + // Read its datetime + $datetime = getDatetime($headElement); + // Construct its URL. + $absoluteFilePath = $file->getPath(); + $relativeFilePath = $folder->getRelativePath($absoluteFilePath); + $mementoUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency + + $matchingMementos[] = [ + 'mementoUrl' => $mementoUrl, + 'originalUrl' => $originalUrl, + 'datetime' => $datetime + ]; + } + } + } catch (Exception $e) { + continue; + } + } + return $matchingMementos; +} + +function joinPaths($piece1, $piece2) { + $left = rtrim($piece1, '/'); + $right = ltrim($piece2, '/'); + return "$left/$right"; +} + +// Reads hrefs from any <link> with relation type "original". +// (note the plural: we also accept pages that claim to correspond to multiple original URLs) +function getOriginalUrls($headElement) { + $originalUrls = []; + $links = $headElement->getElementsByTagName('link'); + foreach ($links as $link) { + $rels = explode(' ', $link->getAttribute('rel')); + if (in_array('original', $rels)) { + $href = $link->getAttribute('href'); + $href = filter_var($href, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + if ($href) { + $originalUrls[] = $href; + } + } + } + return $originalUrls; +} + +// Read the content of the first <meta http-equiv="Memento-Datetime">, if any. +function getDatetime($headElement) { + $metas = $headElement->getElementsByTagName('meta'); + foreach($metas as $meta) { + // Let's match case-insensitively, I guess?
+ if (strtolower($meta->getAttribute('http-equiv')) === 'memento-datetime') { + $datetime = $meta->getAttribute('content'); + $datetime = DateTime::createFromFormat(DateTime::RFC1123, $datetime)->getTimestamp(); + return $datetime; // Return directly at the first match + } + } + return null; +} + +function normaliseUrl($url) { + // Ignore trailing slashes. Because everybody does. + $url = rtrim($url, '/'); + + // HACK. Replace multiple slashes with a single one. Because Nextcloud will have already done this + // to the queried url (e.g. 'http://abc' arrives to us as 'http:/abc'). + $url = preg_replace('%/{2,}%', '/', $url); + + return $url; +}