nextcloud-memento/lib/Controller/ TimeGateController.php
156 lines
4.7 KiB

  1. <?php
  2. namespace OCA\Memento\Controller;
  3. use DOMDocument;
  4. use DateTime;
  5. use OCP\IRequest;
  6. use OCP\IServerContainer;
  7. use OCP\AppFramework\Controller;
  8. use OCP\AppFramework\Http\RedirectResponse;
  9. use OCP\AppFramework\Http\DataDisplayResponse;
  10. class TimeGateController extends Controller {
  11. private $userFolder;
  12. public function __construct(
  13. $AppName,
  14. IRequest $request,
  15. $UserId,
  16. IServerContainer $serverContainer
  17. ) {
  18. parent::__construct($AppName, $request);
  19. $this->userFolder = $serverContainer->getUserFolder($UserId);
  20. }
  21. /**
  22. * @PublicPage
  23. * @NoAdminRequired
  24. * @NoCSRFRequired
  25. */
  26. public function timeGate($url) {
  27. // Get all HTML files the user owns.
  28. $files = $this->userFolder->searchByMime('text/html');
  29. // Filter them for pages that have a <link rel="original"> referring to the given URL.
  30. $matchingFiles = array();
  31. foreach ($files as $file) {
  32. $content = $file->getContent();
  33. try {
  34. $DOM = new DOMDocument;
  35. $DOM->loadHTML($content);
  36. $head = $DOM->documentElement->getElementsByTagName('head')[0];
  37. $originals = getOriginals($head);
  38. foreach ($originals as $original) {
  39. if (normaliseUrl($original) === normaliseUrl($url)) {
  40. // Found a match!
  41. $datetime = getDatetime($head);
  42. $matchingFiles[] = [
  43. 'file' => $file,
  44. 'original' => $original,
  45. 'datetime' => $datetime
  46. ];
  47. }
  48. }
  49. } catch (Exception $e) {
  50. continue;
  51. }
  52. }
  53. // Choose one of the matched files, if any.
  54. if (count($matchingFiles) === 0) {
  55. // No matches. :(
  56. $message = "<h1>No snapshots found for requested URL. :(</h1>";
  57. return new DataDisplayResponse($message, 404);
  58. } else if (count($matchingFiles) === 1) {
  59. // One match; no need to choose.
  60. $chosenFile = $matchingFiles[0];
  61. } else {
  62. // Multiple matches: choose based on requested date.
  63. $acceptDatetimeHeader = $this->request->getHeader('Accept-Datetime');
  64. if ($acceptDatetimeHeader) {
  65. try {
  66. $requestedDatetime = DateTime::createFromFormat(DateTime::RFC1123, $acceptDatetimeHeader)
  67. ->getTimestamp();
  68. } catch (Exception $e) {
  69. return new DataDisplayResponse("Invalid Accept-Datetime header.", 400);
  70. }
  71. } else {
  72. // Not sending the header means requesting the most recent version.
  73. $requestedDatetime = time();
  74. }
  75. // Pick the one closest to the requested date (either before or after it).
  76. $chosenFile = minBy($matchingFiles, function ($matchingFile) use ($requestedDatetime) {
  77. return abs($matchingFile['datetime'] - $requestedDatetime);
  78. });
  79. }
  80. // Send a 302 Found redirect pointing to the chosen file.
  81. $absoluteFilePath = $chosenFile['file']->getPath();
  82. $relativeFilePath = $this->userFolder->getRelativePath($absoluteFilePath);
  83. $fileUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
  84. $originalUrl = $chosenFile['original'];
  85. $response = new RedirectResponse($fileUrl);
  86. $response->setStatus(302);
  87. $response->addHeader('Vary', 'accept-datetime');
  88. $response->addHeader('Link', "<$originalUrl>; rel=\"original\"");
  89. return $response;
  90. }
  91. }
  92. function joinPaths($piece1, $piece2) {
  93. $left = rtrim($piece1, '/');
  94. $right = ltrim($piece2, '/');
  95. return "$left/$right";
  96. }
  97. // Reads hrefs from any <link> with relation type "original".
  98. // (note the plural: we also accept pages that claim to correspond to multiple original URLs)
  99. function getOriginals($head) {
  100. $originals = [];
  101. $links = $head->getElementsByTagName('link');
  102. foreach ($links as $link) {
  103. $rels = explode(' ', $link->getAttribute('rel'));
  104. if (in_array('original', $rels)) {
  105. $href = $link->getAttribute('href');
  106. $href = filter_var($href, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
  107. if ($href) {
  108. $originals[] = $href;
  109. }
  110. }
  111. }
  112. return $originals;
  113. }
  114. // Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
  115. function getDatetime($head) {
  116. $metas = $head->getElementsByTagName('meta');
  117. foreach($metas as $meta) {
  118. // Let's match case-insensitively, I guess?
  119. if (strtolower($meta->getAttribute('http-equiv')) === 'memento-datetime') {
  120. $datetime = $meta->getAttribute('content');
  121. $datetime = DateTime::createFromFormat(DateTime::RFC1123, $datetime)->getTimestamp();
  122. return $datetime; // Return directly at the first match
  123. }
  124. }
  125. return null;
  126. }
  127. function normaliseUrl($url) {
  128. // Ignore trailing slashes. Because everybody does.
  129. $url = rtrim($url, '/');
  130. // Replace multiple slashes with a single one. Because Nextcloud will have already done this to
  131. // the queried url (e.g. 'http://abc' arrives to us as 'http:/abc')
  132. $url = preg_replace('%/{2,}%', '/', $url);
  133. return $url;
  134. }
  135. function minBy($array, $iteratee) {
  136. // is there any simpler way for this in php?
  137. $values = array_map($iteratee, $array);
  138. $argmin = array_search(min($values), $values);
  139. return $array[$argmin];
  140. }