|
- <?php
- namespace OCA\Memento\Controller;
-
- use \DOMDocument;
- use \DateTime;
- use OCP\Share;
- use OCP\Files\FileInfo;
-
- // Finds HTML files that claim to be a snapshot of the given URL;
- // Returns an array of mementos, sorted by datetime, with each memento represented by an array:
- // [
- // 'mementoUrl' => URL of the file, relative to the nextcloud instance
- // 'originalUrls' => original URLs, usually just one.
- // 'datetime' => snapshot datetime as a unix timestamp
- // ]
- //
- // Each mementoUrl is hardcoded to /apps/raw/..., thus relying on the 'raw' app to serve the files.
-
/**
 * Lookup helpers for controllers that list mementos of a URL.
 *
 * Both methods read $this->serverContainer and $this->loggedInUserId, which
 * are not defined in this trait and must be supplied by the using class.
 */
trait MementoFinder {
    /**
     * List one user's mementos that match $url.
     *
     * The user's publicly shared mementos are always considered; the private
     * ones are added only when $userId is the logged-in user.
     *
     * @param string $userId whose mementos to search
     * @param string $url    the original URL being looked up
     * @return array matching mementos, as returned by listMementosMatchingUrl()
     */
    public function findSingleUserMementosForUrl($userId, $url) {
        $shareManager = $this->serverContainer->getShareManager();
        $candidates = findPublicMementos($shareManager, $userId);

        // Private files are only added for the logged-in user's own collection.
        $isOwnCollection = ($this->loggedInUserId === $userId);
        if ($isOwnCollection) {
            $ownFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
            $candidates = array_merge(
                $candidates,
                findPrivateMementos($ownFolder, $this->loggedInUserId)
            );
        }

        return listMementosMatchingUrl($candidates, $url);
    }

    /**
     * List the mementos matching $url across all users.
     *
     * Every user's publicly shared mementos are considered, plus the private
     * ones of the logged-in user (if any).
     *
     * @param string $url the original URL being looked up
     * @return array matching mementos, as returned by listMementosMatchingUrl()
     */
    public function findAllUsersMementosForUrl($url) {
        // Collect the ids first; callForAllUsers hands us one user at a time.
        $userIds = [];
        $this->serverContainer->getUserManager()->callForAllUsers(
            function ($user) use (&$userIds) {
                $userIds[] = $user->getUID();
            }
        );

        $shareManager = $this->serverContainer->getShareManager();
        $candidates = [];
        foreach ($userIds as $uid) {
            foreach (findPublicMementos($shareManager, $uid) as $publicMemento) {
                $candidates[] = $publicMemento;
            }
        }

        // A logged-in visitor additionally gets their own private mementos.
        if ($this->loggedInUserId) {
            $ownFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
            $candidates = array_merge(
                $candidates,
                findPrivateMementos($ownFolder, $this->loggedInUserId)
            );
        }

        return listMementosMatchingUrl($candidates, $url);
    }
}
-
/**
 * Collect memento info for the HTML files found in a user's folder.
 *
 * @param mixed  $userFolder folder object; searchByMime() and getRelativePath() are called on it
 * @param string $userId     used to build the /apps/raw/u/<userId>/... URL of each file
 * @return array list of memento info arrays, each extended with a 'mementoUrl' entry
 */
function findPrivateMementos($userFolder, $userId) {
    $rawUrlPrefix = "/apps/raw/u/$userId"; // XXX hardcoded dependency

    $mementos = [];
    foreach ($userFolder->searchByMime('text/html') as $htmlFile) {
        // Peek into the file; anything extractMementoInfo() rejects is skipped.
        $info = extractMementoInfo($htmlFile);
        if (!$info) {
            continue;
        }

        $pathInsideUserFolder = $userFolder->getRelativePath($htmlFile->getPath());
        $info['mementoUrl'] = joinPaths($rawUrlPrefix, $pathInsideUserFolder);
        $mementos[] = $info;
    }
    return $mementos;
}
-
/**
 * Collect memento info for everything a user has shared by public link.
 *
 * A shared file is inspected directly; for a shared folder, every text/html
 * file inside it is inspected.
 *
 * @param mixed  $shareManager share manager; getSharesBy() is called on it
 * @param string $userId       the user whose link shares are listed
 * @return array list of memento info arrays, each extended with a 'mementoUrl' entry
 */
function findPublicMementos($shareManager, $userId) {
    $shares = $shareManager->getSharesBy(
        $userId,
        Share::SHARE_TYPE_LINK,
        null, /* path */
        true, /* include reshares */
        -1 /* no limit */
    );

    $foundMementos = [];
    foreach ($shares as $share) {
        try {
            $node = $share->getNode();
        } catch (\OCP\Files\NotFoundException $e) {
            // The shared file/folder no longer exists. One stale share must not
            // abort the listing of every other memento, so skip it.
            continue;
        }

        $shareUrl = "/apps/raw/s/" . $share->getToken(); // XXX hardcoded dependency

        if ($node->getType() === FileInfo::TYPE_FILE) {
            // NOTE(review): unlike the folder branch, a directly shared file is
            // parsed whatever its MIME type is -- confirm this is intended.
            $mementoInfo = extractMementoInfo($node);
            if ($mementoInfo) {
                $mementoInfo['mementoUrl'] = $shareUrl;
                $foundMementos[] = $mementoInfo;
            }
            continue;
        }

        // Share is a folder: go through all HTML files inside the shared folder.
        $folder = $node;
        foreach ($folder->searchByMime('text/html') as $file) {
            $mementoInfo = extractMementoInfo($file);
            if ($mementoInfo) {
                $relativeFilePath = $folder->getRelativePath($file->getPath());
                $mementoInfo['mementoUrl'] = joinPaths($shareUrl, $relativeFilePath);
                $foundMementos[] = $mementoInfo;
            }
        }
    }
    return $foundMementos;
}
-
/**
 * Reduce a list of found mementos to the final result for one URL.
 *
 * Steps, applied in this order: keep those matching $url, drop duplicates
 * (a file may be reachable through both a public and a private URL), then
 * sort by datetime.
 *
 * @param array  $foundMementos memento info arrays
 * @param string $url           the requested original URL
 * @return array the filtered, deduplicated and sorted mementos
 */
function listMementosMatchingUrl($foundMementos, $url) {
    return sortMementos(
        deduplicateMementos(
            filterMementosByUrl($foundMementos, $url)
        )
    );
}
-
/**
 * Keep only the mementos for which matchesUrl() is true.
 *
 * The original array keys of the kept entries are preserved.
 *
 * @param array  $mementos memento info arrays
 * @param string $url      the requested original URL
 * @return array the matching subset of $mementos
 */
function filterMementosByUrl($mementos, $url) {
    $kept = [];
    foreach ($mementos as $key => $mementoInfo) {
        if (matchesUrl($mementoInfo, $url)) {
            $kept[$key] = $mementoInfo;
        }
    }
    return $kept;
}
-
/**
 * Tell whether any of a memento's original URLs equals $url after normalisation.
 *
 * @param array  $mementoInfo memento info array; its 'originalUrls' entry is read
 * @param string $url         the requested original URL
 * @return bool true on the first match, false if none matches
 */
function matchesUrl($mementoInfo, $url) {
    $wanted = normaliseUrl($url);
    foreach ($mementoInfo['originalUrls'] as $candidate) {
        if (normaliseUrl($candidate) === $wanted) {
            return true;
        }
    }
    return false;
}
-
/**
 * Bring a URL into the form used for comparisons.
 *
 * Currently this only strips trailing slashes, so that URLs differing
 * solely by them compare as equal.
 *
 * @param string $url
 * @return string $url without any trailing '/' characters
 */
function normaliseUrl($url) {
    return rtrim($url, '/');
}
-
/**
 * Drop mementos whose 'id' has already been seen earlier in the list.
 *
 * The first occurrence of each id is kept; the result is re-indexed from 0.
 *
 * @param array $mementos memento info arrays, each with an 'id' entry
 * @return array list without repeated ids, in original order
 */
function deduplicateMementos($mementos) {
    $alreadySeen = [];
    $unique = [];
    foreach ($mementos as $candidate) {
        $id = $candidate['id'];
        if (array_key_exists($id, $alreadySeen)) {
            continue;
        }
        $alreadySeen[$id] = null;
        $unique[] = $candidate;
    }
    return $unique;
}
-
/**
 * Order mementos by their 'datetime' entry, oldest first.
 *
 * @param array $mementos memento info arrays, each with a 'datetime' entry
 * @return array the same mementos as a re-indexed, sorted list
 */
function sortMementos($mementos) {
    $byDatetimeAscending = function ($first, $second) {
        return $first['datetime'] <=> $second['datetime'];
    };
    usort($mementos, $byDatetimeAscending);
    return $mementos;
}
-
/**
 * Join two path pieces with exactly one '/' between them.
 *
 * Trailing slashes of the first piece and leading slashes of the second
 * piece are removed before joining.
 *
 * @param string $piece1 left-hand part
 * @param string $piece2 right-hand part
 * @return string the joined path
 */
function joinPaths($piece1, $piece2) {
    return rtrim($piece1, '/') . '/' . ltrim($piece2, '/');
}
-
/**
 * Parse a file as HTML and read the memento metadata from its <head>.
 *
 * @param mixed $file file object; getContent() and getFileInfo()->getId() are called on it
 * @return array|null null when the content is empty, cannot be parsed, or has
 *                    no <head>; otherwise an array with the keys
 *                    'originalUrls', 'datetime' and 'id' (used for deduplication)
 */
function extractMementoInfo($file) {
    $content = $file->getContent();
    // loadHTML() rejects empty input (ValueError on PHP 8), so bail out early.
    if (!is_string($content) || trim($content) === '') {
        return null;
    }

    $DOM = new DOMDocument;
    // Real-world HTML is rarely valid for libxml; collect its complaints
    // internally instead of emitting a PHP warning for each of them.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $loaded = $DOM->loadHTML($content);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }
    if (!$loaded || !$DOM->documentElement) {
        return null; // nothing parseable in there
    }

    $headElement = $DOM->documentElement->getElementsByTagName('head')->item(0);
    if (!$headElement) {
        return null; // possibly $content was not HTML at all.
    }

    return [
        'originalUrls' => getOriginalUrls($headElement),
        'datetime' => getDatetime($headElement),
        'id' => $file->getFileInfo()->getId() // for deduplication
    ];
}
-
/**
 * Read the hrefs of every <link> whose relation types include "original".
 * (Note the plural: pages that claim to correspond to multiple original URLs
 * are accepted too.)
 *
 * The rel attribute is split on any whitespace and compared
 * case-insensitively. Hrefs that are not valid URLs are dropped.
 *
 * @param \DOMElement $headElement the document's <head>
 * @return array list of original URLs, in document order
 */
function getOriginalUrls($headElement) {
    $originalUrls = [];
    foreach ($headElement->getElementsByTagName('link') as $link) {
        $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('original', $rels, true)) {
            continue;
        }
        // FILTER_VALIDATE_URL already requires a scheme; the separate
        // FILTER_FLAG_SCHEME_REQUIRED constant no longer exists in PHP 8.
        $href = filter_var(trim($link->getAttribute('href')), FILTER_VALIDATE_URL);
        if ($href !== false) {
            $originalUrls[] = $href;
        }
    }
    return $originalUrls;
}
-
/**
 * Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
 *
 * The http-equiv value is matched case-insensitively. Only the first such
 * element is considered.
 *
 * @param \DOMElement $headElement the document's <head>
 * @return int|null the snapshot datetime as a unix timestamp; null when there
 *                  is no such element or its content is not an RFC 1123 date
 */
function getDatetime($headElement) {
    foreach ($headElement->getElementsByTagName('meta') as $meta) {
        if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
            continue;
        }
        $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
        // createFromFormat() yields false for unparsable input; report that as
        // "no datetime" instead of calling a method on false.
        return $parsed === false ? null : $parsed->getTimestamp();
    }
    return null;
}
|