nextcloud-memento/lib/Controller/ findMementos.php
227 lines
7.1 KiB

  1. <?php
  2. namespace OCA\Memento\Controller;
  3. use \DOMDocument;
  4. use \DateTime;
  5. use OCP\Share;
  6. use OCP\Files\FileInfo;
// Finds HTML files that claim to be a snapshot of the given URL.
// Returns an array of mementos, sorted by datetime (oldest first), with each memento represented by an array:
// [
//   'mementoUrl' => URL of the file, relative to the nextcloud instance
//   'originalUrls' => original URLs, usually just one.
//   'datetime' => snapshot datetime as a unix timestamp (null if the file declares none)
//   'id' => file id of the snapshot, used to deduplicate files reachable via several URLs
// ]
//
// Each mementoUrl is hardcoded to /apps/raw/..., thus relying on the 'raw' app to serve the files.
  16. trait MementoFinder {
  17. function findSingleUserMementosForUrl($userId, $url) {
  18. // Get the user's public mementos.
  19. $foundMementos = findPublicMementos($this->serverContainer->getShareManager(), $userId);
  20. // If logged in, and asking for one's own mementos, get private mementos too.
  21. if ($this->loggedInUserId === $userId) {
  22. $userFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
  23. $moreMementos = findPrivateMementos($userFolder);
  24. $foundMementos = array_merge($foundMementos, $moreMementos);
  25. }
  26. return listMementosMatchingUrl($foundMementos, $url);
  27. }
  28. function findAllUsersMementosForUrl($url) {
  29. $foundMementos = [];
  30. // Get the public mementos of every user.
  31. $allUserIds = [];
  32. $this->serverContainer->getUserManager()->callForAllUsers(
  33. function ($user) use (&$allUserIds) { $allUserIds[] = $user->getUID(); }
  34. );
  35. $shareManager = $this->serverContainer->getShareManager();
  36. foreach ($allUserIds as $userId) {
  37. $moreMementos = findPublicMementos($shareManager, $userId);
  38. $foundMementos = array_merge($foundMementos, $moreMementos);
  39. }
  40. // If logged in, get current user's private mementos too.
  41. if ($this->loggedInUserId) {
  42. $userFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
  43. $moreMementos = findPrivateMementos($userFolder);
  44. $foundMementos = array_merge($foundMementos, $moreMementos);
  45. }
  46. return listMementosMatchingUrl($foundMementos, $url);
  47. }
  48. }
  49. function findPrivateMementos($folder) {
  50. $urlForFile = function ($file) use ($folder) {
  51. $absoluteFilePath = $file->getPath();
  52. $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
  53. $rawFileUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
  54. return $rawFileUrl;
  55. };
  56. // Peek into each HTML file the user owns, and return those that are mementos.
  57. $files = $folder->searchByMime('text/html');
  58. $foundMementos = [];
  59. foreach ($files as $file) {
  60. $mementoInfo = extractMementoInfo($file);
  61. if ($mementoInfo) {
  62. $mementoInfo['mementoUrl'] = $urlForFile($file);
  63. $foundMementos[] = $mementoInfo;
  64. }
  65. }
  66. return $foundMementos;
  67. }
  68. function findPublicMementos($shareManager, $userId) {
  69. $shares = $shareManager->getSharesBy(
  70. $userId,
  71. Share::SHARE_TYPE_LINK,
  72. null, /* path */
  73. true, /* include reshares */
  74. -1 /* no limit */
  75. );
  76. $urlForSharedFile = function ($share) {
  77. return "/apps/raw/s/" . $share->getToken(); // XXX hardcoded dependency
  78. };
  79. $urlForFileInsideSharedFolder = function ($share, $folder, $file) {
  80. $absoluteFilePath = $file->getPath();
  81. $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
  82. return joinPaths("/apps/raw/s/{$share->getToken()}", $relativeFilePath);
  83. };
  84. // Look into every shared file to see if it is a memento.
  85. $foundMementos = [];
  86. foreach ($shares as $share) {
  87. $node = $share->getNode();
  88. if ($node->getType() === FileInfo::TYPE_FILE) {
  89. $mementoInfo = extractMementoInfo($node);
  90. if ($mementoInfo) {
  91. $mementoInfo['mementoUrl'] = $urlForSharedFile($share);
  92. $foundMementos[] = $mementoInfo;
  93. }
  94. } else {
  95. // Share is a folder: Go through all html files inside the shared folder.
  96. $folder = $node;
  97. $files = $folder->searchByMime('text/html');
  98. foreach ($files as $file) {
  99. $mementoInfo = extractMementoInfo($file);
  100. if ($mementoInfo) {
  101. $mementoInfo['mementoUrl'] = $urlForFileInsideSharedFolder($share, $folder, $file);
  102. $foundMementos[] = $mementoInfo;
  103. }
  104. }
  105. }
  106. }
  107. return $foundMementos;
  108. }
  109. function listMementosMatchingUrl($foundMementos, $url) {
  110. // Filter those that match the requested URL
  111. $matchingMementos = filterMementosByUrl($foundMementos, $url);
  112. // Deduplicate (as a file may be accessible both through a public and a private URL)
  113. $matchingMementos = deduplicateMementos($matchingMementos);
  114. // Sort them by date.
  115. $matchingMementos = sortMementos($matchingMementos);
  116. return $matchingMementos;
  117. }
  118. function filterMementosByUrl($mementos, $url) {
  119. $matchingMementos = array_filter($mementos, function ($mementoInfo) use ($url) {
  120. return matchesUrl($mementoInfo, $url);
  121. });
  122. return $matchingMementos;
  123. }
  124. function matchesUrl($mementoInfo, $url) {
  125. $originalUrls = $mementoInfo['originalUrls'];
  126. foreach ($originalUrls as $originalUrl) {
  127. if (normaliseUrl($originalUrl) === normaliseUrl($url)) {
  128. return true;
  129. }
  130. }
  131. return false;
  132. }
  133. function normaliseUrl($url) {
  134. // Ignore trailing slashes. Because everybody does.
  135. $url = rtrim($url, '/');
  136. return $url;
  137. }
  138. function deduplicateMementos($mementos) {
  139. $deduped = [];
  140. $seenIds = [];
  141. foreach ($mementos as $memento) {
  142. if (!array_key_exists($memento['id'], $seenIds)) {
  143. $deduped[] = $memento;
  144. $seenIds[$memento['id']] = null;
  145. }
  146. }
  147. return $deduped;
  148. }
  149. // Sort an array of mementos by their datetime. Oldest first.
  150. function sortMementos($mementos) {
  151. usort($mementos, function ($m1, $m2) { return $m1['datetime'] <=> $m2['datetime']; });
  152. return $mementos;
  153. }
  154. function joinPaths($piece1, $piece2) {
  155. $left = rtrim($piece1, '/');
  156. $right = ltrim($piece2, '/');
  157. return "$left/$right";
  158. }
  159. function extractMementoInfo($file) {
  160. $content = $file->getContent();
  161. $DOM = new DOMDocument;
  162. $DOM->loadHTML($content);
  163. $headElement = $DOM->documentElement->getElementsByTagName('head')[0];
  164. if (!$headElement) return null; // possibly $content was not HTML at all.
  165. $originalUrls = getOriginalUrls($headElement);
  166. $datetime = getDatetime($headElement);
  167. return [
  168. 'originalUrls' => $originalUrls,
  169. 'datetime' => $datetime,
  170. 'id' => $file->getFileInfo()->getId() // for deduplication
  171. ];
  172. }
  173. // Reads hrefs from any <link> with relation type "original".
  174. // (note the plural: we also accept pages that claim to correspond to multiple original URLs)
  175. function getOriginalUrls($headElement) {
  176. $originalUrls = [];
  177. $links = $headElement->getElementsByTagName('link');
  178. foreach ($links as $link) {
  179. $rels = explode(' ', $link->getAttribute('rel'));
  180. if (in_array('original', $rels)) {
  181. $href = $link->getAttribute('href');
  182. $href = filter_var($href, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
  183. if ($href) {
  184. $originalUrls[] = $href;
  185. }
  186. }
  187. }
  188. return $originalUrls;
  189. }
  190. // Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
  191. function getDatetime($headElement) {
  192. $metas = $headElement->getElementsByTagName('meta');
  193. foreach($metas as $meta) {
  194. // Let's match case-insensitively, I guess?
  195. if (strtolower($meta->getAttribute('http-equiv')) === 'memento-datetime') {
  196. $datetime = $meta->getAttribute('content');
  197. $datetime = DateTime::createFromFormat(DateTime::RFC1123, $datetime)->getTimestamp();
  198. return $datetime; // Return directly at the first match
  199. }
  200. }
  201. return null;
  202. }