findMementos.php 3.3 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. <?php
  2. use DOMDocument;
  3. use DateTime;
  // Finds HTML files that claim to be a snapshot of the given URL.
  //
  // Parameters:
  //   $folder  folder object to search in; this function calls searchByMime() and
  //            getRelativePath() on it (presumably a Nextcloud folder node -- confirm against caller)
  //   $url     the original URL whose snapshots we are looking for
  //
  // Returns an array of mementos, sorted by datetime (oldest first), with each memento
  // represented by an array:
  // [
  //   'mementoUrl' => URL of the file, relative to the nextcloud instance
  //   'originalUrl' => original URL, presumably equal to the given $url, except we normalise a bit
  //   'datetime' => snapshot datetime as a unix timestamp (null if the file declares none)
  // ]
  //
  // Files that cannot be parsed, or that lack a <head>, are skipped rather than aborting the search.
  function findMementos($folder, $url) {
      // The wanted URL does not change per file, so normalise it only once.
      $wantedUrl = normaliseUrl($url);

      // Get all HTML files below the folder.
      $files = $folder->searchByMime('text/html');

      // Real-world HTML is rarely valid. Let libxml collect its complaints internally instead of
      // raising a PHP warning for every markup problem; the previous setting is restored below.
      $previousLibxmlSetting = libxml_use_internal_errors(true);

      $matchingMementos = [];
      try {
          foreach ($files as $file) {
              $content = $file->getContent();
              // An empty document cannot be a memento (and loadHTML('') throws on PHP 8).
              if (!is_string($content) || trim($content) === '') {
                  continue;
              }

              try {
                  $DOM = new DOMDocument;
                  $loaded = $DOM->loadHTML($content);
                  libxml_clear_errors();
                  if (!$loaded || $DOM->documentElement === null) {
                      continue;
                  }

                  // The parser only creates a <head> when the document has head content.
                  $headElement = $DOM->documentElement->getElementsByTagName('head')->item(0);
                  if ($headElement === null) {
                      continue;
                  }

                  // Keep the file if any <link rel="original"> refers to the wanted URL.
                  foreach (getOriginalUrls($headElement) as $originalUrl) {
                      if (normaliseUrl($originalUrl) !== $wantedUrl) {
                          continue;
                      }

                      // Found a match! Read its datetime and construct its URL.
                      $relativeFilePath = $folder->getRelativePath($file->getPath());
                      $matchingMementos[] = [
                          'mementoUrl' => joinPaths("/apps/raw/files", $relativeFilePath), // XXX hardcoded dependency
                          'originalUrl' => $originalUrl,
                          'datetime' => getDatetime($headElement),
                      ];

                      // One entry per file, even if several of its links normalise to the same URL.
                      break;
                  }
              } catch (Throwable $e) {
                  // Deliberately best-effort: a single broken file must not break the whole listing.
                  // Throwable (not just Exception) because PHP reports e.g. method calls on null as Error.
                  continue;
              }
          }
      } finally {
          libxml_clear_errors();
          libxml_use_internal_errors($previousLibxmlSetting);
      }

      // Sort mementos by their datetime. Oldest first (a null datetime sorts before any timestamp).
      usort($matchingMementos, function ($m1, $m2) {
          return $m1['datetime'] <=> $m2['datetime'];
      });

      return $matchingMementos;
  }
  // Joins two path pieces with exactly one slash at the seam: trailing slashes are stripped
  // from the first piece and leading slashes from the second before they are glued together.
  function joinPaths($piece1, $piece2) {
      return rtrim($piece1, '/') . '/' . ltrim($piece2, '/');
  }
  // Reads hrefs from any <link> whose rel attribute contains the link type "original".
  // (note the plural: we also accept pages that claim to correspond to multiple original URLs)
  //
  // Parameters:
  //   $headElement  DOM element whose descendant <link> elements are inspected
  //
  // Returns a list of the hrefs that pass URL validation (scheme and host required), in
  // document order. Links with a missing or invalid href are ignored.
  function getOriginalUrls($headElement) {
      $originalUrls = [];

      foreach ($headElement->getElementsByTagName('link') as $link) {
          // rel holds a whitespace-separated set of link types that compare case-insensitively,
          // so split on any whitespace run and lowercase before looking for "original".
          $rels = preg_split('/\s+/', strtolower($link->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
          if (!in_array('original', $rels, true)) {
              continue;
          }

          // FILTER_VALIDATE_URL already insists on a scheme; the separate FILTER_FLAG_SCHEME_REQUIRED
          // flag was deprecated in PHP 7.3 and no longer exists in PHP 8, so it must not be passed.
          $href = filter_var(trim($link->getAttribute('href')), FILTER_VALIDATE_URL);
          if ($href !== false) {
              $originalUrls[] = $href;
          }
      }

      return $originalUrls;
  }
  // Reads the content of the first <meta http-equiv="Memento-Datetime">, if any.
  //
  // Parameters:
  //   $headElement  DOM element whose descendant <meta> elements are inspected
  //
  // Returns the datetime as a unix timestamp, or null when there is no such <meta> or when
  // its content cannot be parsed in RFC 1123 format (e.g. "Thu, 21 Jan 2010 00:02:12 GMT").
  function getDatetime($headElement) {
      foreach ($headElement->getElementsByTagName('meta') as $meta) {
          // Let's match case-insensitively, I guess?
          if (strtolower($meta->getAttribute('http-equiv')) !== 'memento-datetime') {
              continue;
          }

          // createFromFormat() returns false (not an object) on unparsable input, so check
          // before asking for the timestamp. Only the first matching <meta> is considered.
          $parsed = DateTime::createFromFormat(DateTime::RFC1123, trim($meta->getAttribute('content')));
          return $parsed === false ? null : $parsed->getTimestamp();
      }

      return null;
  }
  // Reduces a URL to the form used when comparing URLs for equality.
  // The result is only meant for comparison; it is not necessarily a usable URL any more.
  function normaliseUrl($url) {
      // Ignore trailing slashes. Because everybody does.
      $withoutTrailingSlashes = rtrim($url, '/');

      // HACK. Squash every run of slashes into a single one, because Nextcloud will have already
      // done this to the queried url (e.g. 'http://abc' arrives to us as 'http:/abc').
      return preg_replace('%/{2,}%', '/', $withoutTrailingSlashes);
  }