Commit 97506b1a authored by David Sveningsson's avatar David Sveningsson
Browse files

fix(lexer): handle unicode bom

parent 1a718990
......@@ -8,7 +8,7 @@ exports[`toBeToken() should fail if token doesn't match 1`] = `
Expected token to equal:
ObjectContaining {\\"type\\": \\"TAG_CLOSE\\"}
Received:
{\\"data\\": [\\"<foo\\", \\"\\", \\"foo\\"], \\"location\\": {\\"column\\": 2, \\"filename\\": \\"inline\\", \\"line\\": 1, \\"offset\\": 1, \\"size\\": 1}, \\"type\\": 6}
{\\"data\\": [\\"<foo\\", \\"\\", \\"foo\\"], \\"location\\": {\\"column\\": 2, \\"filename\\": \\"inline\\", \\"line\\": 1, \\"offset\\": 1, \\"size\\": 1}, \\"type\\": 7}
Difference:
......@@ -30,7 +30,7 @@ Difference:
+ \\"offset\\": 1,
+ \\"size\\": 1,
+ },
+ \\"type\\": 6,
+ \\"type\\": 7,
}"
`;
......
......@@ -154,6 +154,17 @@ describe("lexer", () => {
});
describe("should tokenize", () => {
it("unicode bom", () => {
	expect.assertions(6);
	const token = lexer.tokenize(inlineSource("\uFEFF<!DOCTYPE html>"));
	/* the BOM is emitted as its own token preceding the doctype tokens */
	const expectedTypes = [
		TokenType.UNICODE_BOM,
		TokenType.DOCTYPE_OPEN,
		TokenType.DOCTYPE_VALUE,
		TokenType.DOCTYPE_CLOSE,
		TokenType.EOF,
	];
	for (const type of expectedTypes) {
		expect(token.next()).toBeToken({ type });
	}
	/* the stream must be exhausted after EOF */
	expect(token.next().done).toBeTruthy();
});
it("xml declaration", () => {
expect.assertions(2);
const token = lexer.tokenize(inlineSource('<?xml version="1.0" encoding="utf-8"?>\n'));
......
......@@ -6,6 +6,7 @@ type LexerTest = [RegExp | false, State | NextStateCallback, TokenType | false];
export type TokenStream = IterableIterator<Token>;
/* eslint-disable no-useless-escape */
/* Matches a leading Unicode byte order mark (U+FEFF) so it can be emitted
 * as a dedicated UNICODE_BOM token instead of being glued onto the first
 * real token. */
const MATCH_UNICODE_BOM = /^\uFEFF/;
/* Matches one newline (CRLF, CR or LF), or a run of spaces/tabs optionally
 * terminated by a newline. */
const MATCH_WHITESPACE = /^(?:\r\n|\r|\n|[ \t]+(?:\r\n|\r|\n)?)/;
/* Matches "<!DOCTYPE" (case-insensitive) followed by whitespace, capturing
 * the DOCTYPE keyword. */
const MATCH_DOCTYPE_OPEN = /^<!(DOCTYPE)\s/i;
/* Matches the doctype value: everything up to (but excluding) ">". */
const MATCH_DOCTYPE_VALUE = /^[^>]+/;
......@@ -173,6 +174,7 @@ export class Lexer {
yield* this.match(
context,
[
[MATCH_UNICODE_BOM, State.INITIAL, TokenType.UNICODE_BOM],
[MATCH_XML_TAG, State.INITIAL, false],
[MATCH_DOCTYPE_OPEN, State.DOCTYPE, TokenType.DOCTYPE_OPEN],
[MATCH_WHITESPACE, State.INITIAL, TokenType.WHITESPACE],
......
import { Location } from "../context";
export enum TokenType {
WHITESPACE = 1,
UNICODE_BOM = 1,
WHITESPACE,
NEWLINE,
DOCTYPE_OPEN,
DOCTYPE_VALUE,
......
......@@ -77,6 +77,10 @@ export class Parser {
const token = it.value;
switch (token.type) {
case TokenType.UNICODE_BOM:
/* ignore */
break;
case TokenType.TAG_OPEN:
this.consumeTag(source, token, tokenStream);
break;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment