findMementos.php 6.3 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. <?php
  2. namespace OCA\Memento\Controller;
  3. use \DOMDocument;
  4. use \DateTime;
  5. use OCP\Share;
  6. use OCP\Files\FileInfo;
  7. // Finds HTML files that claim to be a snapshot of the given URL;
  8. // Returns an array of mementos, sorted by datetime, with each memento represented by an array:
  9. // [
  10. // 'mementoUrl' => URL of the file, relative to the nextcloud instance
  11. // 'originalUrls' => original URLs, usually just one.
  12. // 'datetime' => snapshot datetime as a unix timestamp
  13. // ]
  14. //
  15. // Each mementoUrl is hardcoded to /apps/raw/..., thus relying on the 'raw' app to serve the files.
  // Lookup methods for controllers that list the mementos (snapshots) of a URL.
  //
  // The class using this trait must provide:
  //   $this->serverContainer - used for getShareManager(), getUserFolder() and getUserManager()
  //   $this->loggedInUserId  - id of the logged-in user; treated as "nobody logged in" when falsy
  trait MementoFinder {
      // Returns the mementos of $url that belong to the user $userId, oldest first.
      //
      // Publicly link-shared mementos are always considered; the user's private files are
      // considered too when $userId is the logged-in user.
      // Each entry has the keys 'mementoUrl', 'originalUrls' and 'datetime'.
      function findSingleUserMementosForUrl($userId, $url) {
          // Get the user's public mementos.
          $foundMementos = findPublicMementos($this->serverContainer->getShareManager(), $userId);

          // If logged in, and asking for one's own mementos, get private mementos too.
          if ($this->loggedInUserId === $userId) {
              $userFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
              $moreMementos = findPrivateMementos($userFolder);
              $foundMementos = mergeMementos($foundMementos, $moreMementos);
          }

          // Filter those that match the requested URL, and sort them.
          $matchingMementos = filterMementosByUrl($foundMementos, $url);
          // sortMementos() receives the array by value and returns the sorted copy, so the
          // result has to be captured; calling it for its side effect leaves the list unsorted.
          $matchingMementos = sortMementos($matchingMementos);
          return $matchingMementos;
      }

      // Returns the mementos of $url across all users, oldest first.
      //
      // Publicly link-shared mementos of every user are considered, plus the private files
      // of the logged-in user (if any).
      // Each entry has the keys 'mementoUrl', 'originalUrls' and 'datetime'.
      function findAllUsersMementosForUrl($url) {
          $foundMementos = [];

          // Get the public mementos of every user.
          $allUserIds = [];
          $this->serverContainer->getUserManager()->callForAllUsers(
              function ($user) use (&$allUserIds) { $allUserIds[] = $user->getUID(); }
          );
          $shareManager = $this->serverContainer->getShareManager();
          foreach ($allUserIds as $userId) {
              $moreMementos = findPublicMementos($shareManager, $userId);
              $foundMementos = mergeMementos($foundMementos, $moreMementos);
          }

          // If logged in, get current user's private mementos too.
          if ($this->loggedInUserId) {
              $userFolder = $this->serverContainer->getUserFolder($this->loggedInUserId);
              $moreMementos = findPrivateMementos($userFolder);
              $foundMementos = mergeMementos($foundMementos, $moreMementos);
          }

          // Filter those that match the requested URL, and sort them.
          $matchingMementos = filterMementosByUrl($foundMementos, $url);
          $matchingMementos = sortMementos($matchingMementos);
          return $matchingMementos;
      }
  }
  // Scans the HTML files found in $folder and returns those that are mementos.
  //
  // $folder: the folder to search; it must offer searchByMime() and getRelativePath().
  // Returns a list of memento arrays ('originalUrls', 'datetime', 'mementoUrl'), where
  // 'mementoUrl' points at the file's path below /apps/raw/files.
  function findPrivateMementos($folder) {
      $mementos = [];

      // Peek into each HTML file in the folder; skip the ones that are not mementos.
      foreach ($folder->searchByMime('text/html') as $htmlFile) {
          $info = extractMementoInfo($htmlFile);
          if (!$info) {
              continue;
          }

          // Build the URL from the file's path relative to the searched folder.
          $pathInFolder = $folder->getRelativePath($htmlFile->getPath());
          $info['mementoUrl'] = joinPaths("/apps/raw/files", $pathInFolder); // XXX hardcoded dependency
          $mementos[] = $info;
      }

      return $mementos;
  }
  // Returns the mementos among the files that $userId has shared through a public link.
  //
  // $shareManager: the share manager used to list the user's link shares.
  // $userId: id of the user whose shares are inspected.
  // Returns a list of memento arrays ('originalUrls', 'datetime', 'mementoUrl'), where
  // 'mementoUrl' is built from the share token below /apps/raw/s/.
  function findPublicMementos($shareManager, $userId) {
      $linkShares = $shareManager->getSharesBy(
          $userId,
          Share::SHARE_TYPE_LINK,
          null, /* path */
          true, /* include reshares */
          -1 /* no limit */
      );

      // Look into every shared file to see if it is a memento.
      $mementos = [];
      foreach ($linkShares as $linkShare) {
          $sharedNode = $linkShare->getNode();
          if ($sharedNode->getType() !== FileInfo::TYPE_FILE) {
              // TODO add files inside shared folders? How to make URLs for those?
              continue;
          }

          $info = extractMementoInfo($sharedNode);
          if ($info) {
              $info['mementoUrl'] = "/apps/raw/s/" . $linkShare->getToken(); // XXX hardcoded dependency
              $mementos[] = $info;
          }
      }

      return $mementos;
  }
  // Concatenates two lists of mementos into one; entries of $mementos1 come first.
  function mergeMementos($mementos1, $mementos2) {
      // TODO deduplicate (we'll get public & private URLs for the same files)
      $combined = array_merge($mementos1, $mementos2);
      return $combined;
  }
  // Keeps only the mementos that claim to be a snapshot of $url.
  // The original array keys of the kept entries are preserved.
  function filterMementosByUrl($mementos, $url) {
      $claimsUrl = function ($candidate) use ($url) {
          return matchesUrl($candidate, $url);
      };
      return array_filter($mementos, $claimsUrl);
  }
  // Tells whether any of the memento's 'originalUrls' equals $url after normalisation.
  function matchesUrl($mementoInfo, $url) {
      foreach ($mementoInfo['originalUrls'] as $candidate) {
          if (normaliseUrl($candidate) !== normaliseUrl($url)) {
              continue;
          }
          return true;
      }
      return false;
  }
  // Brings a URL into the form used for comparisons.
  function normaliseUrl($url) {
      // Trailing slashes are stripped, so 'http://a/' and 'http://a' compare equal.
      return rtrim($url, '/');
  }
  // Returns the given mementos ordered by their 'datetime', oldest first.
  // The argument is received by value, so the caller's array is left untouched;
  // the returned array is re-indexed from 0.
  function sortMementos($mementos) {
      $byDatetime = function ($left, $right) {
          return $left['datetime'] <=> $right['datetime'];
      };
      usort($mementos, $byDatetime);
      return $mementos;
  }
  // Joins two path pieces with exactly one slash between them.
  // Slashes at the end of $piece1 and at the start of $piece2 are dropped first.
  function joinPaths($piece1, $piece2) {
      return rtrim($piece1, '/') . '/' . ltrim($piece2, '/');
  }
  // Parses a file as HTML and extracts the memento metadata from its <head>.
  //
  // $file: an object whose getContent() yields the file's content.
  // Returns ['originalUrls' => string[], 'datetime' => int|null], or null when the
  // content is empty, cannot be parsed into a document, or has no <head> element.
  function extractMementoInfo($file) {
      $content = $file->getContent();
      // Nothing to parse; note that loadHTML() rejects an empty string outright on PHP 8.
      if (!is_string($content) || trim($content) === '') {
          return null;
      }

      $DOM = new DOMDocument;
      // Real-world HTML routinely trips the parser's error reporting; collect those
      // errors internally instead of emitting a PHP warning per file, then restore
      // whatever setting was active before.
      $previousSetting = libxml_use_internal_errors(true);
      $loaded = $DOM->loadHTML($content);
      libxml_clear_errors();
      libxml_use_internal_errors($previousSetting);

      // No root element means the content was not usable as HTML; bail out before
      // dereferencing it.
      if (!$loaded || !$DOM->documentElement) {
          return null;
      }

      $headElement = $DOM->documentElement->getElementsByTagName('head')->item(0);
      if (!$headElement) return null; // possibly $content was not HTML at all.

      $originalUrls = getOriginalUrls($headElement);
      $datetime = getDatetime($headElement);
      return [
          'originalUrls' => $originalUrls,
          'datetime' => $datetime
      ];
  }
  // Reads hrefs from any <link> with relation type "original".
  // (note the plural: we also accept pages that claim to correspond to multiple original URLs)
  //
  // $headElement: the DOM element (normally <head>) whose <link> descendants are inspected.
  // Returns the list of hrefs that are valid URLs, in document order; may be empty.
  function getOriginalUrls($headElement) {
      $originalUrls = [];
      $links = $headElement->getElementsByTagName('link');
      foreach ($links as $link) {
          // rel holds whitespace-separated link types that are matched case-insensitively,
          // so split on any run of whitespace and compare in lower case.
          $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
          if (!in_array('original', $rels, true)) {
              continue;
          }

          // FILTER_VALIDATE_URL already requires a scheme. The former
          // FILTER_FLAG_SCHEME_REQUIRED flag is not passed because it no longer exists
          // in PHP 8 and referencing it there is an error.
          $href = filter_var($link->getAttribute('href'), FILTER_VALIDATE_URL);
          if ($href) {
              $originalUrls[] = $href;
          }
      }
      return $originalUrls;
  }
  // Reads the snapshot datetime from <meta http-equiv="Memento-Datetime" content="...">.
  //
  // $headElement: the DOM element (normally <head>) whose <meta> descendants are inspected.
  // Returns the unix timestamp of the first such <meta> whose content parses as an
  // RFC 1123 date, or null when there is none.
  function getDatetime($headElement) {
      $metas = $headElement->getElementsByTagName('meta');
      foreach ($metas as $meta) {
          // Let's match case-insensitively, I guess?
          if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
              continue;
          }

          $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
          // createFromFormat() yields false for a value it cannot parse; skip such a tag
          // rather than calling a method on false.
          if ($parsed === false) {
              continue;
          }
          return $parsed->getTimestamp(); // Return directly at the first usable match
      }
      return null;
  }