|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- <?php
- namespace OCA\Memento\Controller;
-
- use DOMDocument;
- use DateTime;
-
/**
 * Finds HTML files that claim to be a snapshot of the given URL.
 *
 * A file counts as a snapshot ("memento") when its <head> contains a
 * <link rel="original"> whose href equals $url after both have been passed
 * through normaliseUrl().
 *
 * @param object $folder Folder to search; must provide searchByMime() and
 *                       getRelativePath() (presumably a Nextcloud folder node — confirm).
 * @param string $url    The original URL whose snapshots are wanted.
 * @return array[] One entry per matching <link>, each with the keys
 *                 'mementoUrl', 'originalUrl' and 'datetime' (the value
 *                 returned by getDatetime(), null when the page declares none),
 *                 sorted by 'datetime', oldest first.
 */
function findMementos($folder, $url) {
    // Get all HTML files the user owns.
    $files = $folder->searchByMime('text/html');

    // The queried URL is the same for every file, so normalise it only once.
    $wantedUrl = normaliseUrl($url);

    // Filter them for pages that have a <link rel="original"> referring to the given URL.
    $matchingMementos = [];
    foreach ($files as $file) {
        $content = $file->getContent();
        if ((string) $content === '') {
            // Nothing to parse; loadHTML() refuses empty input.
            continue;
        }
        try {
            $DOM = new DOMDocument;
            // Saved web pages are rarely valid HTML; do not emit a PHP warning for
            // every markup problem libxml recovers from.
            $DOM->loadHTML($content, LIBXML_NOWARNING | LIBXML_NOERROR);

            $rootElement = $DOM->documentElement;
            $headElement = $rootElement === null
                ? null
                : $rootElement->getElementsByTagName('head')->item(0);
            if ($headElement === null) {
                // No <head>, so the page cannot declare an original URL.
                continue;
            }

            $matchingUrls = array_filter(
                getOriginalUrls($headElement),
                function ($originalUrl) use ($wantedUrl) {
                    return normaliseUrl($originalUrl) === $wantedUrl;
                }
            );
            if (!$matchingUrls) {
                continue;
            }

            // Found a match! The datetime and the file's URL are per-file values,
            // so compute them once even if several <link>s match.
            $datetime = getDatetime($headElement);
            $absoluteFilePath = $file->getPath();
            $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
            $mementoUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency

            foreach ($matchingUrls as $originalUrl) {
                $matchingMementos[] = [
                    'mementoUrl' => $mementoUrl,
                    'originalUrl' => $originalUrl,
                    'datetime' => $datetime,
                ];
            }
        } catch (\Throwable $e) {
            // Deliberately best-effort: a file that cannot be processed is skipped
            // rather than failing the whole listing. The leading backslash matters:
            // an unqualified class name here would resolve inside this file's
            // namespace and never match anything.
            continue;
        }
    }

    // Sort mementos by their datetime. Oldest first.
    usort($matchingMementos, function ($m1, $m2) {
        return $m1['datetime'] <=> $m2['datetime'];
    });

    return $matchingMementos;
}
-
/**
 * Joins two path pieces with exactly one slash between them.
 *
 * Trailing slashes of the first piece and leading slashes of the second are
 * dropped before the pieces are glued together.
 *
 * @param string $piece1 Left-hand part of the path.
 * @param string $piece2 Right-hand part of the path.
 * @return string The combined path.
 */
function joinPaths($piece1, $piece2) {
    return rtrim($piece1, '/') . '/' . ltrim($piece2, '/');
}
-
/**
 * Reads hrefs from any <link> with relation type "original".
 *
 * Note the plural: pages that claim to correspond to multiple original URLs
 * are accepted too. Links whose href is not a valid URL are ignored.
 *
 * @param \DOMElement $headElement Element whose descendant <link>s are inspected.
 * @return string[] The validated hrefs, in document order.
 */
function getOriginalUrls($headElement) {
    $originalUrls = [];
    foreach ($headElement->getElementsByTagName('link') as $link) {
        // rel is a set of whitespace-separated, case-insensitive link types, so
        // split on any run of whitespace and compare in lower case.
        $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if ($rels === false || !in_array('original', $rels, true)) {
            continue;
        }

        // FILTER_VALIDATE_URL already requires a scheme and host by itself. The
        // former FILTER_FLAG_SCHEME_REQUIRED flag was removed in PHP 8 and
        // referencing it there is a fatal error.
        $href = filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL);
        if ($href !== false) {
            $originalUrls[] = $href;
        }
    }
    return $originalUrls;
}
-
/**
 * Reads the content of the first <meta http-equiv="Memento-Datetime">, if any.
 *
 * The http-equiv value is matched case-insensitively. Only the first matching
 * <meta> is considered; later ones are never consulted.
 *
 * @param \DOMElement $headElement Element whose descendant <meta>s are inspected.
 * @return int|null Unix timestamp of the declared snapshot time, or null when
 *                  there is no such <meta> or its content is not an RFC 1123 date.
 */
function getDatetime($headElement) {
    foreach ($headElement->getElementsByTagName('meta') as $meta) {
        // Let's match case-insensitively, I guess?
        if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
            continue;
        }

        // createFromFormat() returns false for unparseable input; calling
        // getTimestamp() on that would be a fatal error, so report "no datetime".
        $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
        return $parsed === false ? null : $parsed->getTimestamp(); // Return directly at the first match
    }
    return null;
}
-
/**
 * Reduces a URL to the form used when comparing URLs for equality.
 *
 * @param string $url The URL to normalise.
 * @return string The URL without trailing slashes and with every run of
 *                slashes collapsed into a single one.
 */
function normaliseUrl($url) {
    // Trailing slashes are ignored, as is common practice.
    $withoutTrailingSlashes = rtrim($url, '/');

    // HACK. Collapse runs of slashes, because Nextcloud will already have done so
    // to the queried url (e.g. 'http://abc' arrives to us as 'http:/abc').
    return preg_replace('%/{2,}%', '/', $withoutTrailingSlashes);
}
|