findMementos.php 3.1 KiB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. <?php
  2. namespace OCA\Memento\Controller;
  3. use DOMDocument;
  4. use DateTime;
  5. // Finds HTML files that claim to be a snapshot of the given URL;
  6. // Returns an array of each file's url + original url + snapshot datetime, sorted by datetime.
  7. function findMementos($folder, $url) {
  8. // Get all HTML files the user owns.
  9. $files = $folder->searchByMime('text/html');
  10. // Filter them for pages that have a <link rel="original"> referring to the given URL.
  11. $matchingMementos = array();
  12. foreach ($files as $file) {
  13. $content = $file->getContent();
  14. try {
  15. $DOM = new DOMDocument;
  16. $DOM->loadHTML($content);
  17. $headElement = $DOM->documentElement->getElementsByTagName('head')[0];
  18. $originalUrls = getOriginalUrls($headElement);
  19. foreach ($originalUrls as $originalUrl) {
  20. if (normaliseUrl($originalUrl) === normaliseUrl($url)) {
  21. // Found a match!
  22. // Read its datetime
  23. $datetime = getDatetime($headElement);
  24. // Construct its URL.
  25. $absoluteFilePath = $file->getPath();
  26. $relativeFilePath = $folder->getRelativePath($absoluteFilePath);
  27. $mementoUrl = joinPaths("/apps/raw/files", $relativeFilePath); // XXX hardcoded dependency
  28. $matchingMementos[] = [
  29. 'mementoUrl' => $mementoUrl,
  30. 'originalUrl' => $originalUrl,
  31. 'datetime' => $datetime
  32. ];
  33. }
  34. }
  35. } catch (Exception $e) {
  36. continue;
  37. }
  38. }
  39. // Sort mementos by their datetime. Oldest first.
  40. usort($matchingMementos, function ($m1, $m2) { return $m1['datetime'] <=> $m2['datetime']; });
  41. return $matchingMementos;
  42. }
  43. function joinPaths($piece1, $piece2) {
  44. $left = rtrim($piece1, '/');
  45. $right = ltrim($piece2, '/');
  46. return "$left/$right";
  47. }
  48. // Reads hrefs from any <link> with relation type "original".
  49. // (note the plural: we also accept pages that claim to correspond to multiple original URLs)
  50. function getOriginalUrls($headElement) {
  51. $originalUrls = [];
  52. $links = $headElement->getElementsByTagName('link');
  53. foreach ($links as $link) {
  54. $rels = explode(' ', $link->getAttribute('rel'));
  55. if (in_array('original', $rels)) {
  56. $href = $link->getAttribute('href');
  57. $href = filter_var($href, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
  58. if ($href) {
  59. $originalUrls[] = $href;
  60. }
  61. }
  62. }
  63. return $originalUrls;
  64. }
  65. // Read the content of the first <meta http-equiv="Memento-Datetime">, if any.
  66. function getDatetime($headElement) {
  67. $metas = $headElement->getElementsByTagName('meta');
  68. foreach($metas as $meta) {
  69. // Let's match case-insensitively, I guess?
  70. if (strtolower($meta->getAttribute('http-equiv')) === 'memento-datetime') {
  71. $datetime = $meta->getAttribute('content');
  72. $datetime = DateTime::createFromFormat(DateTime::RFC1123, $datetime)->getTimestamp();
  73. return $datetime; // Return directly at the first match
  74. }
  75. }
  76. return null;
  77. }
  78. function normaliseUrl($url) {
  79. // Ignore trailing slashes. Because everybody does.
  80. $url = rtrim($url, '/');
  81. // HACK. Replace multiple slashes with a single one. Because Nextcloud will have already done this
  82. // to the queried url (e.g. 'http://abc' arrives to us as 'http:/abc').
  83. $url = preg_replace('%/{2,}%', '/', $url);
  84. return $url;
  85. }