text-fragments-ts/src/ whatwg-html.ts
154 lines
7.6 KiB

  1. ///////////////////////////////////////////////
  2. /// Required pieces of the WHATWG HTML Spec ///
  3. ///////////////////////////////////////////////
  4. // Based on the version of 13 August 2020 <https://html.spec.whatwg.org/commit-snapshots/3c52fe139d9c637eb901932a77d743d6d5ecaa56/>
  5. import {
  6. integer,
  7. locale,
  8. isElement,
  9. nonEmptyString,
  10. } from './common.js';
  11. import {
  12. htmlNamespace,
  13. AsciiWhitespace,
  14. xmlNamespace,
  15. } from './whatwg-infra.js';
  16. // § 3.2.6.2 The lang and xml:lang attributes
  17. // https://html.spec.whatwg.org/multipage/dom.html#language
  18. export function languageOf(node: Node): locale {
  19. // “To determine the language of a node, user agents must look at the nearest ancestor element (including the element itself if the node is an element) that has a lang attribute in the XML namespace set or is an HTML element and has a lang in no namespace attribute set. That attribute specifies the language of the node (regardless of its value).”
  20. let curNode: Node | null = node;
  21. while (curNode !== null) {
  22. if (isElement(curNode)) {
  23. // “If both the lang attribute in no namespace and the lang attribute in the XML namespace are set on an element, user agents must use the lang attribute in the XML namespace, and the lang attribute in no namespace must be ignored for the purposes of determining the element's language.”
  24. const language = curNode.getAttributeNS(xmlNamespace, 'lang') ?? curNode.getAttributeNS(null, 'lang');
  25. if (language !== null)
  26. return language;
  27. }
  28. curNode = curNode.parentNode;
  29. }
  30. // “If node's inclusive ancestors do not have either attribute set, but there is a pragma-set default language set, then that is the language of the node.”
  31. const pragmaSetDefaultLanguage = getPragmaSetDefaultLanguage();
  32. if (pragmaSetDefaultLanguage !== undefined)
  33. return pragmaSetDefaultLanguage;
  34. // “If there is no pragma-set default language set, then language information from a higher-level protocol (such as HTTP), if any, must be used as the final fallback language instead.”
  35. // Probably not available to us. (well, perhaps we could try fetch document.URL from cache and read its headers…)
  36. // “In the absence of any such language information, and in cases where the higher-level protocol reports multiple languages, the language of the node is unknown, and the corresponding language tag is the empty string.”
  37. return '';
  38. }
  39. // § 4.2.5.3 Pragma directives
  40. // https://html.spec.whatwg.org/multipage/semantics.html#pragma-set-default-language
  41. // This implementation is a workaround, since we cannot read the pragma-set default language from the DOM. We simply rerun the steps the user agent should have executed to determine this value, when the corresponding <meta> elements are inserted into the document.
  42. // (note that we assume the meta elements were not modified after creation; in scenarios with attribute modifications our result could deviate from the correct result)
  43. export function getPragmaSetDefaultLanguage(): string | undefined {
  44. // “Content language state (http-equiv="content-language")”
  45. // “This pragma sets the pragma-set default language. Until such a pragma is successfully processed, there is no pragma-set default language.”
  46. let pragmaSetDefaultLanguage: string | undefined = undefined;
  47. const metaElements = document.querySelectorAll('meta[http-equiv="content-language"]');
  48. metaElements.forEach(element => {
  49. // 1. “If the meta element has no content attribute, then return.”
  50. if (element.hasAttribute('content'))
  51. return;
  52. // 3. “Let input be the value of the element's content attribute.”
  53. // (swapping the order for implementation simplicity)
  54. const input = element.getAttribute('content') as string;
  55. // 2. “If the element's content attribute contains a U+002C COMMA character (,) then return.”
  56. if (input.includes(','))
  57. return;
  58. // 4. “Let position point at the first character of input.”
  59. let position = 0;
  60. // 5. “Skip ASCII whitespace within input given position.”
  61. while (position < input.length && AsciiWhitespace.includes(input[position]))
  62. position++;
  63. // 6. “Collect a sequence of code points that are not ASCII whitespace from input given position.”
  64. // 7. “Let candidate be the string that resulted from the previous step.”
  65. let candidate = '';
  66. while (!AsciiWhitespace.includes(input[position])) {
  67. candidate += input[position];
  68. position++;
  69. }
  70. // 8. “If candidate is the empty string, return.”
  71. if (candidate === '')
  72. return;
  73. // 9. “Set the pragma-set default language to candidate.”
  74. pragmaSetDefaultLanguage = candidate;
  75. });
  76. return pragmaSetDefaultLanguage as string | undefined;
  77. }
  78. // § 7.5 Origin
  79. // https://html.spec.whatwg.org/multipage/origin.html#concept-origin
  80. // “An origin is one of the following:”
  81. export type origin = opaqueOrigin | tupleOrigin
  82. // https://html.spec.whatwg.org/multipage/origin.html#concept-origin-opaque
  83. // “An opaque origin”: “An internal value, with no serialization it can be recreated from (it is serialized as "null" per serialization of an origin), for which the only meaningful operation is testing for equality.”
  84. export type opaqueOrigin = symbol; // I guess?
  85. // https://html.spec.whatwg.org/multipage/origin.html#concept-origin-tuple
  86. // “A tuple origin”: “A tuple consists of:
  87. // • A scheme (a scheme).
  88. // • A host (a host).
  89. // • A port (a port).
  90. // • A domain (null or a domain). Null unless stated otherwise.”
  91. export type tupleOrigin = [
  92. // (using primitive types here; specifying these further is beyond scope)
  93. string,
  94. string | integer | integer[], // integers for IP addresses
  95. integer | null,
  96. nonEmptyString | null,
  97. ];
  98. // § 12.1.2 Elements
  99. // https://html.spec.whatwg.org/multipage/syntax.html#void-elements
  100. export const voidElements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
  101. // § 12.2 Parsing HTML documents
  102. // https://html.spec.whatwg.org/multipage/parsing.html#serializes-as-void
  103. // “For the purposes of the following algorithm, an element serializes as void if its element type is one of the void elements, or is basefont, bgsound, frame, or keygen.”
  104. export function serializesAsVoid(element: Element): boolean {
  105. // From § 2.1.3 XML Compatibility, <https://html.spec.whatwg.org/multipage/infrastructure.html#element-type>:
  106. // “The term element type is used to refer to the set of elements that have a given local name and namespace.”
  107. // “Except where otherwise stated, all elements defined or mentioned in this specification are in the HTML namespace ("http://www.w3.org/1999/xhtml")”
  108. if (element.namespaceURI === htmlNamespace
  109. && (voidElements.includes(element.localName) || ['basefont', 'bgsound', 'frame', 'keygen'].includes(element.localName))
  110. ) {
  111. return true;
  112. }
  113. return false;
  114. }
  115. // § 14.1 Rendering → Introduction
  116. // https://html.spec.whatwg.org/multipage/rendering.html#being-rendered
  117. // “An element is being rendered if it has any associated CSS layout boxes, SVG layout boxes, or some equivalent in other styling languages.”
  118. export function isBeingRendered(element: Element) {
  119. // “Note … The presence of the hidden attribute normally means the element is not being rendered, though this might be overridden by the style sheets.”
  120. // TODO figure out what exactly we should/could test.
  121. return !element.hasAttribute('hidden'); // TEMP
  122. }