searchByMime('text/html');

    // Filter them for pages that have a <link rel="original"> referring to the given URL.
    $matchingMementos = array();
    foreach ($files as $file) {
        $content = $file->getContent();
        try {
            $DOM = new DOMDocument;
            $DOM->loadHTML($content);
            $headElement = $DOM->documentElement->getElementsByTagName('head')[0];
            $originalUrls = getOriginalUrls($headElement);
            foreach ($originalUrls as $originalUrl) {
                if (normaliseUrl($originalUrl) === normaliseUrl($url)) {
                    // Found a match!
                    // Read its datetime
                    $datetime = getDatetime($headElement);
                    // Construct its URL.
                    $absoluteFilePath = $file->getPath();
                    $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
                    $mementoUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
                    $matchingMementos[] = [
                        'mementoUrl' => $mementoUrl,
                        'originalUrl' => $originalUrl,
                        'datetime' => $datetime
                    ];
                }
            }
        } catch (Exception $e) {
            continue;
        }
    }
    return $matchingMementos;
}

/**
 * Joins two path pieces with exactly one slash between them.
 *
 * Trailing slashes of the first piece and leading slashes of the second piece
 * are dropped before joining; slashes elsewhere are left untouched.
 *
 * @param string $piece1 Left-hand part of the path.
 * @param string $piece2 Right-hand part of the path.
 * @return string The joined path.
 */
function joinPaths($piece1, $piece2) {
    $left = rtrim($piece1, '/');
    $right = ltrim($piece2, '/');
    return "$left/$right";
}

/**
 * Reads the hrefs from any <link> with relation type "original".
 *
 * Note the plural: pages that claim to correspond to multiple original URLs
 * are accepted as well.
 *
 * @param DOMElement|null $headElement Element to search in (the caller passes the page's <head>);
 *                                     null (e.g. a page without <head>) yields an empty list.
 * @return string[] The hrefs that pass FILTER_VALIDATE_URL, in document order.
 */
function getOriginalUrls($headElement) {
    $originalUrls = [];
    if ($headElement === null) {
        // Calling a method on null raises an Error, which a catch (Exception) does not intercept.
        return $originalUrls;
    }

    $links = $headElement->getElementsByTagName('link');
    foreach ($links as $link) {
        // rel holds whitespace-separated link types that compare ASCII case-insensitively,
        // so split on any whitespace run and lowercase before comparing.
        $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('original', $rels, true)) {
            continue;
        }

        // FILTER_VALIDATE_URL has always required a scheme; the explicit
        // FILTER_FLAG_SCHEME_REQUIRED flag no longer exists in PHP 8.
        $href = filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL);
        if ($href !== false) {
            $originalUrls[] = $href;
        }
    }
    return $originalUrls;
}

/**
 * Reads the content of the first <meta http-equiv="Memento-Datetime">, if any.
 *
 * @param DOMElement|null $headElement Element to search in (the caller passes the page's <head>).
 * @return int|null Unix timestamp parsed from the first matching element's content attribute;
 *                  null when there is no such element or its content is not a valid
 *                  RFC 1123 date.
 */
function getDatetime($headElement) {
    if ($headElement === null) {
        return null;
    }

    $metas = $headElement->getElementsByTagName('meta');
    foreach ($metas as $meta) {
        // Let's match case-insensitively, I guess?
        if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
            continue;
        }

        // createFromFormat() returns false for unparseable input; report that as
        // "no datetime" rather than calling getTimestamp() on false.
        $parsed = DateTime::createFromFormat(DateTime::RFC1123, $meta->getAttribute('content'));
        // Only the first match is considered, so return directly either way.
        return $parsed === false ? null : $parsed->getTimestamp();
    }
    return null;
}

/**
 * Reduces a URL to the form used for comparing a page's original URL with the queried URL.
 *
 * The result is only meant for equality comparison, not for dereferencing:
 * e.g. 'http://abc/' becomes 'http:/abc'.
 *
 * @param string $url The URL to normalise.
 * @return string The normalised URL.
 */
function normaliseUrl($url) {
    // Ignore trailing slashes. Because everybody does.
    $url = rtrim($url, '/');
    // HACK. Replace multiple slashes with a single one. Because Nextcloud will have already done this
    // to the queried url (e.g. 'http://abc' arrives to us as 'http:/abc').
    $url = preg_replace('%/{2,}%', '/', $url);
    return $url;
}