nextcloud-memento/lib/Controller/ findMementos.php
215 lines
6.8 KiB

  1. <?php
  2. namespace OCA\Memento\Controller;
  3. use \DOMDocument;
  4. use \DateTime;
  5. use OCP\Share;
  6. use OCP\Files\FileInfo;
  7. // Finds HTML files that claim to be a snapshot of the given URL;
  8. // Returns an array of mementos, sorted by datetime, with each memento represented by an array:
  9. // [
  10. // 'mementoUrl' => URL of the file, relative to the nextcloud instance
  11. // 'originalUrls' => original URLs, usually just one.
  12. // 'datetime' => snapshot datetime as a unix timestamp
  13. // ]
  14. //
  15. // Each mementoUrl is hardcoded to /apps/raw/..., thus relying on the 'raw' app to serve the files.
  16. trait MementoFinder {
  17. function findSingleUserMementosForUrl($userId, $url) {
  18. // Get the user's public mementos.
  19. $foundMementos = findPublicMementos($this->serverContainer->getShareManager(), $userId);
  20. // If logged in, and asking for one's own mementos, get private mementos too.
  21. if ($this->loggedInUserId === $userId) {
  22. $userFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
  23. $moreMementos = findPrivateMementos($userFolder);
  24. $foundMementos = mergeMementos($foundMementos, $moreMementos);
  25. }
  26. // Filter those that match the requested URL, and sort them.
  27. $matchingMementos = filterMementosByUrl($foundMementos, $url);
  28. sortMementos($matchingMementos);
  29. return $matchingMementos;
  30. }
  31. function findAllUsersMementosForUrl($url) {
  32. $foundMementos = [];
  33. // Get the public mementos of every user.
  34. $allUserIds = [];
  35. $this->serverContainer->getUserManager()->callForAllUsers(
  36. function ($user) use (&$allUserIds) { $allUserIds[] = $user->getUID(); }
  37. );
  38. $shareManager = $this->serverContainer->getShareManager();
  39. foreach ($allUserIds as $userId) {
  40. $moreMementos = findPublicMementos($shareManager, $userId);
  41. $foundMementos = mergeMementos($foundMementos, $moreMementos);
  42. }
  43. // If logged in, get current user's private mementos too.
  44. if ($this->loggedInUserId) {
  45. $userFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
  46. $moreMementos = findPrivateMementos($userFolder);
  47. $foundMementos = mergeMementos($foundMementos, $moreMementos);
  48. }
  49. // Filter those that match the requested URL, and sort them.
  50. $matchingMementos = filterMementosByUrl($foundMementos, $url);
  51. $matchingMementos = sortMementos($matchingMementos);
  52. return $matchingMementos;
  53. }
  54. }
  55. function findPrivateMementos($folder) {
  56. $urlForFile = function ($file) use ($folder) {
  57. $absoluteFilePath = $file->getPath();
  58. $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
  59. $rawFileUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
  60. return $rawFileUrl;
  61. };
  62. // Peek into each HTML file the user owns, and return those that are mementos.
  63. $files = $folder->searchByMime('text/html');
  64. $foundMementos = [];
  65. foreach ($files as $file) {
  66. $mementoInfo = extractMementoInfo($file);
  67. if ($mementoInfo) {
  68. $mementoInfo['mementoUrl'] = $urlForFile($file);
  69. $foundMementos[] = $mementoInfo;
  70. }
  71. }
  72. return $foundMementos;
  73. }
  74. function findPublicMementos($shareManager, $userId) {
  75. $shares = $shareManager->getSharesBy(
  76. $userId,
  77. Share::SHARE_TYPE_LINK,
  78. null, /* path */
  79. true, /* include reshares */
  80. -1 /* no limit */
  81. );
  82. $urlForSharedFile = function ($share) {
  83. return "/apps/raw/s/" . $share->getToken(); // XXX hardcoded dependency
  84. };
  85. $urlForFileInsideSharedFolder = function ($share, $folder, $file) {
  86. $absoluteFilePath = $file->getPath();
  87. $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
  88. return joinPaths("/apps/raw/s/{$share->getToken()}", $relativeFilePath);
  89. };
  90. // Look into every shared file to see if it is a memento.
  91. $foundMementos = [];
  92. foreach ($shares as $share) {
  93. $node = $share->getNode();
  94. if ($node->getType() === FileInfo::TYPE_FILE) {
  95. $mementoInfo = extractMementoInfo($node);
  96. if ($mementoInfo) {
  97. $mementoInfo['mementoUrl'] = $urlForSharedFile($share);
  98. $foundMementos[] = $mementoInfo;
  99. }
  100. } else {
  101. // Share is a folder: Go through all html files inside the shared folder.
  102. $folder = $node;
  103. $files = $folder->searchByMime('text/html');
  104. foreach ($files as $file) {
  105. $mementoInfo = extractMementoInfo($file);
  106. if ($mementoInfo) {
  107. $mementoInfo['mementoUrl'] = $urlForFileInsideSharedFolder($share, $folder, $file);
  108. $foundMementos[] = $mementoInfo;
  109. }
  110. }
  111. }
  112. }
  113. return $foundMementos;
  114. }
  115. function mergeMementos($mementos1, $mementos2) {
  116. // TODO deduplicate (we'll get public & private URLs for the same files)
  117. return array_merge($mementos1, $mementos2);
  118. }
  119. function filterMementosByUrl($mementos, $url) {
  120. $matchingMementos = array_filter($mementos, function ($mementoInfo) use ($url) {
  121. return matchesUrl($mementoInfo, $url);
  122. });
  123. return $matchingMementos;
  124. }
  125. function matchesUrl($mementoInfo, $url) {
  126. $originalUrls = $mementoInfo['originalUrls'];
  127. foreach ($originalUrls as $originalUrl) {
  128. if (normaliseUrl($originalUrl) === normaliseUrl($url)) {
  129. return true;
  130. }
  131. }
  132. return false;
  133. }
  134. function normaliseUrl($url) {
  135. // Ignore trailing slashes. Because everybody does.
  136. $url = rtrim($url, '/');
  137. return $url;
  138. }
  139. // Sort an array of mementos by their datetime. Oldest first.
  140. function sortMementos($mementos) {
  141. usort($mementos, function ($m1, $m2) { return $m1['datetime'] <=> $m2['datetime']; });
  142. return $mementos;
  143. }
  144. function joinPaths($piece1, $piece2) {
  145. $left = rtrim($piece1, '/');
  146. $right = ltrim($piece2, '/');
  147. return "$left/$right";
  148. }
  149. function extractMementoInfo($file) {
  150. $content = $file->getContent();
  151. $DOM = new DOMDocument;
  152. $DOM->loadHTML($content);
  153. $headElement = $DOM->documentElement->getElementsByTagName('head')[0];
  154. if (!$headElement) return null; // possibly $content was not HTML at all.
  155. $originalUrls = getOriginalUrls($headElement);
  156. $datetime = getDatetime($headElement);
  157. return [
  158. 'originalUrls' => $originalUrls,
  159. 'datetime' => $datetime
  160. ];
  161. }
  162. // Reads hrefs from any <link> with relation type "original".
  163. // (note the plural: we also accept pages that claim to correspond to multiple original URLs)
  164. function getOriginalUrls($headElement) {
  165. $originalUrls = [];
  166. $links = $headElement->getElementsByTagName('link');
  167. foreach ($links as $link) {
  168. $rels = explode(' ', $link->getAttribute('rel'));
  169. if (in_array('original', $rels)) {
  170. $href = $link->getAttribute('href');
  171. $href = filter_var($href, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
  172. if ($href) {
  173. $originalUrls[] = $href;
  174. }
  175. }
  176. }
  177. return $originalUrls;
  178. }
  179. // Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
  180. function getDatetime($headElement) {
  181. $metas = $headElement->getElementsByTagName('meta');
  182. foreach($metas as $meta) {
  183. // Let's match case-insensitively, I guess?
  184. if (strtolower($meta->getAttribute('http-equiv')) === 'memento-datetime') {
  185. $datetime = $meta->getAttribute('content');
  186. $datetime = DateTime::createFromFormat(DateTime::RFC1123, $datetime)->getTimestamp();
  187. return $datetime; // Return directly at the first match
  188. }
  189. }
  190. return null;
  191. }