searchByMime('text/html');
// Filter them for pages that have a <link rel="original"> referring to the given URL.
$matchingMementos = array();
foreach ($files as $file) {
$content = $file->getContent();
try {
$DOM = new DOMDocument;
$DOM->loadHTML($content);
$headElement = $DOM->documentElement->getElementsByTagName('head')[0];
$originalUrls = getOriginalUrls($headElement);
foreach ($originalUrls as $originalUrl) {
if (normaliseUrl($originalUrl) === normaliseUrl($url)) {
// Found a match!
// Read its datetime
$datetime = getDatetime($headElement);
// Construct its URL.
$absoluteFilePath = $file->getPath();
$relativeFilePath = $folder->getRelativePath($absoluteFilePath);
$mementoUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
$matchingMementos[] = [
'mementoUrl' => $mementoUrl,
'originalUrl' => $originalUrl,
'datetime' => $datetime
];
}
}
} catch (Exception $e) {
continue;
}
}
return $matchingMementos;
}
/**
 * Concatenates two path fragments with exactly one slash between them.
 *
 * Trailing slashes of the first fragment and leading slashes of the second
 * are dropped before joining; everything else is left untouched.
 *
 * @param string $piece1 Left-hand fragment.
 * @param string $piece2 Right-hand fragment.
 * @return string The joined path.
 */
function joinPaths($piece1, $piece2) {
    return rtrim($piece1, '/') . '/' . ltrim($piece2, '/');
}
// Reads hrefs from any <link> with relation type "original".
// (note the plural: we also accept pages that claim to correspond to multiple original URLs)
/**
 * Collects the href of every <link> inside the given head element whose rel
 * attribute contains the "original" relation type.
 *
 * A page may claim several original URLs, so all valid ones are returned.
 *
 * @param DOMElement|null $headElement The document's <head>; null is treated as "no links".
 * @return string[] The hrefs that pass FILTER_VALIDATE_URL, in document order.
 */
function getOriginalUrls($headElement) {
    // A page without a <head> simply declares no original URLs.
    if ($headElement === null) {
        return [];
    }

    $originalUrls = [];
    foreach ($headElement->getElementsByTagName('link') as $link) {
        // rel holds a whitespace-separated token list (tabs/newlines and repeated
        // spaces are allowed) and relation types compare case-insensitively.
        $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('original', $rels, true)) {
            continue;
        }

        // FILTER_VALIDATE_URL already requires a scheme. The former
        // FILTER_FLAG_SCHEME_REQUIRED constant no longer exists in PHP 8, so
        // referencing it would raise an Error there.
        $href = filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL);
        if ($href !== false) {
            $originalUrls[] = $href;
        }
    }
    return $originalUrls;
}
// Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
/**
 * Reads the snapshot time from the first <meta http-equiv="Memento-Datetime">
 * found inside the given head element.
 *
 * Only the first matching <meta> is considered; later ones are ignored even
 * if the first one cannot be parsed.
 *
 * @param DOMElement|null $headElement The document's <head>; null is treated as "no meta tags".
 * @return int|null Unix timestamp, or null when there is no such <meta> or its
 *                  content is not a valid RFC 1123 date.
 */
function getDatetime($headElement) {
    if ($headElement === null) {
        return null;
    }

    foreach ($headElement->getElementsByTagName('meta') as $meta) {
        // The http-equiv name is matched case-insensitively.
        if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
            continue;
        }

        $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
        // createFromFormat() yields false on unparseable input; calling
        // getTimestamp() on that would raise an Error, so report "unknown" instead.
        return $parsed === false ? null : $parsed->getTimestamp();
    }
    return null;
}
/**
 * Reduces a URL to a canonical form used only for equality comparison.
 *
 * Two adjustments are made:
 *  - all trailing slashes are removed, since they are commonly treated as insignificant;
 *  - every run of consecutive slashes becomes a single slash. This is a hack:
 *    per the original author, Nextcloud has already collapsed them in the
 *    queried URL (e.g. 'http://abc' arrives as 'http:/abc'), so stored URLs
 *    must be mangled the same way to compare equal.
 *
 * @param string $url The URL to normalise.
 * @return string The normalised (no longer necessarily valid) URL.
 */
function normaliseUrl($url) {
    $withoutTrailingSlashes = rtrim($url, '/');
    return preg_replace('%/{2,}%', '/', $withoutTrailingSlashes);
}