Browse Source

Factor out snapshot search from timegate logic.

tags/v0.1.0
Gerben 1 year ago
parent
commit
b39a46f59b
2 changed files with 110 additions and 91 deletions
  1. +17
    -91
      lib/Controller/TimeGateController.php
  2. +93
    -0
      lib/Controller/findMementos.php

+ 17
- 91
lib/Controller/TimeGateController.php View File

@@ -1,7 +1,8 @@
<?php
namespace OCA\Memento\Controller;

use DOMDocument;
require_once __DIR__ . '/findMementos.php';

use DateTime;

use OCP\IRequest;
@@ -10,6 +11,8 @@ use OCP\AppFramework\Controller;
use OCP\AppFramework\Http\RedirectResponse;
use OCP\AppFramework\Http\DataDisplayResponse;

use findMementos;

class TimeGateController extends Controller {
private $userFolder;

@@ -29,42 +32,16 @@ class TimeGateController extends Controller {
* @NoCSRFRequired
*/
public function timeGate($url) {
// Get all HTML files the user owns.
$files = $this->userFolder->searchByMime('text/html');

// Filter them for pages that have a <link rel="original"> referring to the given URL.
$matchingFiles = array();
foreach ($files as $file) {
$content = $file->getContent();
try {
$DOM = new DOMDocument;
$DOM->loadHTML($content);
$head = $DOM->documentElement->getElementsByTagName('head')[0];
$originals = getOriginals($head);
foreach ($originals as $original) {
if (normaliseUrl($original) === normaliseUrl($url)) {
// Found a match!
$datetime = getDatetime($head);
$matchingFiles[] = [
'file' => $file,
'original' => $original,
'datetime' => $datetime
];
}
}
} catch (Exception $e) {
continue;
}
}
$matchingMementos = findMementos($this->userFolder, $url);

// Choose one of the matched files, if any.
if (count($matchingFiles) === 0) {
// Choose one of the matched mementos, if any.
if (count($matchingMementos) === 0) {
// No matches. :(
$message = "<h1>No snapshots found for requested URL. :(</h1>";
return new DataDisplayResponse($message, 404);
} else if (count($matchingFiles) === 1) {
} else if (count($matchingMementos) === 1) {
// One match; no need to choose.
$chosenFile = $matchingFiles[0];
$chosenMemento = $matchingMementos[0];
} else {
// Multiple matches: choose based on requested date.
$acceptDatetimeHeader = $this->request->getHeader('Accept-Datetime');
@@ -80,73 +57,22 @@ class TimeGateController extends Controller {
$requestedDatetime = time();
}
// Pick the one closest to the requested date (either before or after it).
$chosenFile = minBy($matchingFiles, function ($matchingFile) use ($requestedDatetime) {
return abs($matchingFile['datetime'] - $requestedDatetime);
});
$chosenMemento = minBy($matchingMementos,
function ($matchingMemento) use ($requestedDatetime) {
return abs($matchingMemento['datetime'] - $requestedDatetime);
}
);
}

// Send a 302 Found redirect pointing to the chosen file.
$absoluteFilePath = $chosenFile['file']->getPath();
$relativeFilePath = $this->userFolder->getRelativePath($absoluteFilePath);
$fileUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
$originalUrl = $chosenFile['original'];
$response = new RedirectResponse($fileUrl);
// Send a 302 Found redirect pointing to the chosen memento.
$response = new RedirectResponse($chosenMemento['mementoUrl']);
$response->setStatus(302);
$response->addHeader('Vary', 'accept-datetime');
$response->addHeader('Link', "<$originalUrl>; rel=\"original\"");
$response->addHeader('Link', "<{$chosenMemento['originalUrl']}>; rel=\"original\"");
return $response;
}
}

// Glues two path pieces together with exactly one slash at the seam:
// trailing slashes of the first piece and leading slashes of the second are dropped.
function joinPaths($piece1, $piece2) {
    return implode('/', [rtrim($piece1, '/'), ltrim($piece2, '/')]);
}

// Reads hrefs from any <link> with relation type "original".
// (note the plural: we also accept pages that claim to correspond to multiple original URLs)
//
// $head: the DOM element whose <link> descendants are inspected.
// Returns a list of the href values that pass PHP's URL validation, in document order.
function getOriginals($head) {
    $originals = [];
    foreach ($head->getElementsByTagName('link') as $link) {
        // The rel attribute is a whitespace-separated list of tokens; compare them
        // case-insensitively so that e.g. rel="Canonical  Original" is recognised too.
        $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('original', $rels, true)) {
            continue;
        }
        // FILTER_VALIDATE_URL always requires a scheme; the explicit
        // FILTER_FLAG_SCHEME_REQUIRED flag was deprecated in PHP 7.3 and removed in PHP 8.0.
        $href = filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL);
        if ($href) {
            $originals[] = $href;
        }
    }
    return $originals;
}

// Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
//
// $head: the DOM element whose <meta> descendants are inspected.
// Returns the Unix timestamp parsed from that element's content attribute (RFC 1123 format),
// or null when there is no such element or its content cannot be parsed.
function getDatetime($head) {
    foreach ($head->getElementsByTagName('meta') as $meta) {
        // Let's match case-insensitively, I guess?
        if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
            continue;
        }
        // Only the first match is considered.
        $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
        // createFromFormat() returns false for unparseable input; calling getTimestamp() on
        // that would be a fatal error, so report "no datetime" instead.
        return $parsed === false ? null : $parsed->getTimestamp();
    }
    return null;
}

// Brings a URL into the form used for comparing URLs with each other.
function normaliseUrl($url) {
    // Ignore trailing slashes. Because everybody does.
    $withoutTrailingSlashes = rtrim($url, '/');

    // Replace multiple slashes with a single one. Because Nextcloud will have already done this to
    // the queried url (e.g. 'http://abc' arrives to us as 'http:/abc')
    return preg_replace('%/{2,}%', '/', $withoutTrailingSlashes);
}

function minBy($array, $iteratee) {
// is there any simpler way for this in php?
$values = array_map($iteratee, $array);


+ 93
- 0
lib/Controller/findMementos.php View File

@@ -0,0 +1,93 @@
<?php
namespace OCA\Memento\Controller;

use DOMDocument;
use DateTime;

// Finds HTML files that claim to be a snapshot of the given URL.
//
// $folder: folder object to search; must offer searchByMime() and getRelativePath().
// $url:    the original URL to look for. It is compared after normaliseUrl() on both sides.
//
// Returns an array with one item per matching <link rel="original">, each item holding:
//   'mementoUrl'  => the file's path relative to $folder, prefixed with /apps/raw/files,
//   'originalUrl' => the href exactly as found in the snapshot,
//   'datetime'    => whatever getDatetime() yields for the snapshot's <head>.
// Files that cannot be parsed, or that have no <head>, are skipped.
function findMementos($folder, $url) {
    // The requested URL does not change while scanning, so normalise it once.
    $wantedUrl = normaliseUrl($url);

    // Get all HTML files in the folder.
    $files = $folder->searchByMime('text/html');

    // Real-world pages rarely satisfy libxml's HTML parser; collect its complaints internally
    // instead of emitting a PHP warning for each one. The previous setting is restored below.
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    // Filter them for pages that have a <link rel="original"> referring to the given URL.
    $matchingMementos = [];
    try {
        foreach ($files as $file) {
            $content = $file->getContent();
            if (!is_string($content) || $content === '') {
                // Nothing to parse (and loadHTML() rejects an empty string).
                continue;
            }
            try {
                $DOM = new DOMDocument;
                $loaded = $DOM->loadHTML($content);
                libxml_clear_errors();
                if ($loaded === false || $DOM->documentElement === null) {
                    continue;
                }
                $headElement = $DOM->documentElement->getElementsByTagName('head')->item(0);
                if ($headElement === null) {
                    continue;
                }
                foreach (getOriginalUrls($headElement) as $originalUrl) {
                    if (normaliseUrl($originalUrl) !== $wantedUrl) {
                        continue;
                    }
                    // Found a match!
                    // Read its datetime
                    $datetime = getDatetime($headElement);
                    // Construct its URL.
                    $absoluteFilePath = $file->getPath();
                    $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
                    $mementoUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency

                    $matchingMementos[] = [
                        'mementoUrl' => $mementoUrl,
                        'originalUrl' => $originalUrl,
                        'datetime' => $datetime
                    ];
                }
            } catch (\Exception $e) {
                // Fully qualified on purpose: a bare "Exception" would resolve to a class in
                // this file's namespace and therefore never match anything.
                continue;
            }
        }
    } finally {
        libxml_use_internal_errors($previousLibxmlSetting);
    }
    return $matchingMementos;
}

// Concatenates two path pieces so that exactly one slash separates them,
// no matter how many slashes end the first piece or start the second.
function joinPaths($piece1, $piece2) {
    return rtrim($piece1, '/') . '/' . ltrim($piece2, '/');
}

// Reads hrefs from any <link> with relation type "original".
// (note the plural: we also accept pages that claim to correspond to multiple original URLs)
//
// $headElement: the DOM element whose <link> descendants are inspected.
// Returns a list of the href values that pass PHP's URL validation, in document order.
function getOriginalUrls($headElement) {
    $originalUrls = [];
    foreach ($headElement->getElementsByTagName('link') as $link) {
        // The rel attribute is a whitespace-separated list of tokens; compare them
        // case-insensitively so that e.g. rel="Canonical  Original" is recognised too.
        $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('original', $rels, true)) {
            continue;
        }
        // FILTER_VALIDATE_URL always requires a scheme; the explicit
        // FILTER_FLAG_SCHEME_REQUIRED flag was deprecated in PHP 7.3 and removed in PHP 8.0.
        $href = filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL);
        if ($href) {
            $originalUrls[] = $href;
        }
    }
    return $originalUrls;
}

// Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
//
// $headElement: the DOM element whose <meta> descendants are inspected.
// Returns the Unix timestamp parsed from that element's content attribute (RFC 1123 format),
// or null when there is no such element or its content cannot be parsed.
function getDatetime($headElement) {
    foreach ($headElement->getElementsByTagName('meta') as $meta) {
        // Let's match case-insensitively, I guess?
        if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
            continue;
        }
        // Only the first match is considered.
        $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
        // createFromFormat() returns false for unparseable input; calling getTimestamp() on
        // that would be a fatal error, so report "no datetime" instead.
        return $parsed === false ? null : $parsed->getTimestamp();
    }
    return null;
}

// Reduces a URL to the form in which URLs are compared with each other.
function normaliseUrl($url) {
    // Ignore trailing slashes. Because everybody does.
    $trimmed = rtrim($url, '/');

    // HACK. Replace multiple slashes with a single one. Because Nextcloud will have already done this
    // to the queried url (e.g. 'http://abc' arrives to us as 'http:/abc').
    return preg_replace('%/{2,}%', '/', $trimmed);
}

Loading…
Cancel
Save