summary refs log tree commit diff stats
path: root/packages/excalidraw/element/textWrapping.test.ts
diff options
context:
space:
mode:
Diffstat (limited to 'packages/excalidraw/element/textWrapping.test.ts')
-rw-r--r-- packages/excalidraw/element/textWrapping.test.ts 633
1 files changed, 633 insertions, 0 deletions
diff --git a/packages/excalidraw/element/textWrapping.test.ts b/packages/excalidraw/element/textWrapping.test.ts
new file mode 100644
index 0000000..6c7bcb8
--- /dev/null
+++ b/packages/excalidraw/element/textWrapping.test.ts
@@ -0,0 +1,633 @@
+import { wrapText, parseTokens } from "./textWrapping";
+import type { FontString } from "./types";
+
+describe("Test wrapText", () => {
+ // font is irrelevant as jsdom does not support FontFace API
+ // `measureText` width is mocked to return `text.length` by `jest-canvas-mock`
+ // https://github.com/hustcc/jest-canvas-mock/blob/master/src/classes/TextMetrics.js
+ const font = "10px Cascadia, Segoe UI Emoji" as FontString;
+
+  it("should wrap the text correctly when word length is exactly equal to max width", () => {
+    // "Excalidraw" measures 100 here, i.e. exactly the max width, so it is
+    // expected on a line of its own without being split.
+    const maxWidth = 100;
+    const wrapped = wrapText("Hello Excalidraw", font, maxWidth);
+    expect(wrapped).toEqual("Hello\nExcalidraw");
+  });
+
+  it("should return the text as is if max width is invalid", () => {
+    const original = "Hello Excalidraw";
+    // Each of these widths is treated as invalid: the input must come back
+    // unchanged. Checked in the same order as before.
+    for (const invalidWidth of [NaN, -1, Infinity]) {
+      expect(wrapText(original, font, invalidWidth)).toEqual(original);
+    }
+  });
+
+ it("should show the text correctly when max width reached", () => {
+ const text = "HelloπŸ˜€";
+ const maxWidth = 10;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe("H\ne\nl\nl\no\nπŸ˜€");
+ });
+
+  it("should not wrap number when wrapping line", () => {
+    // The formatted number is expected to move to the next line in one piece.
+    const wrapped = wrapText("don't wrap this number 99,100.99", font, 300);
+    expect(wrapped).toBe("don't wrap this number\n99,100.99");
+  });
+
+  it("should trim all trailing whitespaces", () => {
+    // NOTE(review): the literal shows a single trailing space while the title
+    // speaks of several - confirm the whitespace was not collapsed.
+    const padded = "Hello ";
+    expect(wrapText(padded, font, 50)).toBe("Hello");
+  });
+
+  it("should trim all but one trailing whitespaces", () => {
+    // NOTE(review): the literal shows a single trailing space while the title
+    // speaks of several - confirm the whitespace was not collapsed.
+    const padded = "Hello ";
+    expect(wrapText(padded, font, 60)).toBe("Hello ");
+  });
+
+  it("should keep preceding whitespaces and trim all trailing whitespaces", () => {
+    // Leading whitespace stays on the first line; nothing is kept in front of
+    // the line break.
+    const wrapped = wrapText(" Hello World", font, 90);
+    expect(wrapped).toBe(" Hello\nWorld");
+  });
+
+  it("should keep some preceding whitespaces, trim trailing whitespaces, but kep those that fit in the trailing line", () => {
+    // Whitespace after the last word survives because it still fits on the
+    // final line.
+    const wrapped = wrapText(" Hello World ", font, 90);
+    expect(wrapped).toBe(" Hello\nWorld ");
+  });
+
+  it("should trim keep those whitespace that fit in the trailing line", () => {
+    // The break falls after "Wo"; the rest, including its trailing
+    // whitespace, is expected on the second line.
+    const wrapped = wrapText("Hello Wo rl d ", font, 100);
+    expect(wrapped).toBe("Hello Wo\nrl d ");
+  });
+
+ it("should support multiple (multi-codepoint) emojis", () => {
+ const text = "πŸ˜€πŸ—ΊπŸ”₯πŸ‘©πŸ½β€πŸ¦°πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦πŸ‡¨πŸ‡Ώ";
+ const maxWidth = 1;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe("πŸ˜€\nπŸ—Ί\nπŸ”₯\nπŸ‘©πŸ½β€πŸ¦°\nπŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦\nπŸ‡¨πŸ‡Ώ");
+ });
+
+  it("should wrap the text correctly when text contains hyphen", () => {
+    const sentence =
+      "Wikipedia is hosted by Wikimedia- Foundation, a non-profit organization that also hosts a range-of other projects";
+    // One entry per expected output line, joined with "\n" for the comparison.
+    const expectedLines = [
+      "Wikipedia",
+      "is hosted",
+      "by",
+      "Wikimedia-",
+      "Foundation,",
+      "a non-",
+      "profit",
+      "organizatio",
+      "n that also",
+      "hosts a",
+      "range-of",
+      "other",
+      "projects",
+    ];
+    expect(wrapText(sentence, font, 110)).toBe(expectedLines.join("\n"));
+
+    // Second input: the hyphen ends up at the start of the last line.
+    const compound = "Hello thereusing-now";
+    expect(wrapText(compound, font, 100)).toEqual("Hello\nthereusing\n-now");
+  });
+
+  it("should support wrapping nested lists", () => {
+    const nestedList = "\tA) one tab\t\t- two tabs - 8 spaces";
+
+    // wider container
+    expect(wrapText(nestedList, font, 100)).toBe(
+      "\tA) one\ntab\t\t- two\ntabs\n- 8 spaces",
+    );
+
+    // narrower container
+    expect(wrapText(nestedList, font, 50)).toBe(
+      "\tA)\none\ntab\n- two\ntabs\n- 8\nspace\ns",
+    );
+  });
+
+ describe("When text is CJK", () => {
+ it("should break each CJK character when width is very small", () => {
+ // "안녕하세요" (Hangul) + "こんにちは世界" (Hiragana, Kanji) + "コンニチハ" (Katakana) + "你好" (Han) = "Hello Hello World Hello Hi"
+ const text = "μ•ˆλ…•ν•˜μ„Έμš”γ“γ‚“γ«γ‘γ―δΈ–η•Œο½ΊοΎοΎ†οΎγƒδ½ ε₯½";
+ const maxWidth = 10;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe(
+ "μ•ˆ\nλ…•\nν•˜\nμ„Έ\nμš”\nこ\nγ‚“\nに\nけ\nは\nδΈ–\nη•Œ\nο½Ί\nン\nοΎ†\nチ\nハ\nδ½ \nε₯½",
+ );
+ });
+
+ it("should break CJK text into longer segments when width is larger", () => {
+ // "안녕하세요" (Hangul) + "こんにちは世界" (Hiragana, Kanji) + "コンニチハ" (Katakana) + "你好" (Han) = "Hello Hello World Hello Hi"
+ const text = "μ•ˆλ…•ν•˜μ„Έμš”γ“γ‚“γ«γ‘γ―δΈ–η•Œο½ΊοΎοΎ†οΎγƒδ½ ε₯½";
+ const maxWidth = 30;
+ const res = wrapText(text, font, maxWidth);
+
+ // measureText is mocked, so it's not precisely what would happen in prod
+ expect(res).toBe("μ•ˆλ…•ν•˜\nμ„Έμš”γ“\nんにけ\nγ―δΈ–η•Œ\nコンニ\nチハ你\nε₯½");
+ });
+
+ it("should handle a combination of CJK, latin, emojis and whitespaces", () => {
+ const text = `a醫 醫 bb δ½ ε₯½ world-i-πŸ˜€πŸ—ΊπŸ”₯`;
+
+ const maxWidth = 150;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe(`a醫 醫 bb δ½ \nε₯½ world-i-πŸ˜€πŸ—Ί\nπŸ”₯`);
+
+ const maxWidth2 = 50;
+ const res2 = wrapText(text, font, maxWidth2);
+ expect(res2).toBe(`a醫 醫\nbb δ½ \nε₯½\nworld\n-i-πŸ˜€\nπŸ—ΊπŸ”₯`);
+
+ const maxWidth3 = 30;
+ const res3 = wrapText(text, font, maxWidth3);
+ expect(res3).toBe(`a醫\n醫\nbb\nδ½ ε₯½\nwor\nld-\ni-\nπŸ˜€\nπŸ—Ί\nπŸ”₯`);
+ });
+
+ it("should break before and after a regular CJK character", () => {
+ const text = "HelloたWorld";
+ const maxWidth1 = 50;
+ const res1 = wrapText(text, font, maxWidth1);
+ expect(res1).toBe("Hello\nた\nWorld");
+
+ const maxWidth2 = 60;
+ const res2 = wrapText(text, font, maxWidth2);
+ expect(res2).toBe("Helloた\nWorld");
+ });
+
+ it("should break before and after certain CJK symbols", () => {
+ const text = "γ“γ‚“γ«γ‘γ―γ€ƒδΈ–η•Œ";
+ const maxWidth1 = 50;
+ const res1 = wrapText(text, font, maxWidth1);
+ expect(res1).toBe("こんにけは\nγ€ƒδΈ–η•Œ");
+
+ const maxWidth2 = 60;
+ const res2 = wrapText(text, font, maxWidth2);
+ expect(res2).toBe("こんにけは〃\nδΈ–η•Œ");
+ });
+
+ it("should break after, not before for certain CJK pairs", () => {
+ const text = "Hello γŸγ€‚";
+ const maxWidth = 70;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe("Hello\nγŸγ€‚");
+ });
+
+ it("should break before, not after for certain CJK pairs", () => {
+ const text = "Helloγ€ŒγŸWorld」";
+ const maxWidth = 60;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe("Hello\nγ€ŒγŸ\nWorld」");
+ });
+
+ it("should break after, not before for certain CJK character pairs", () => {
+ const text = "γ€ŒHelloγŸγ€World";
+ const maxWidth = 70;
+ const res = wrapText(text, font, maxWidth);
+ expect(res).toBe("γ€ŒHello\nγŸγ€World");
+ });
+
+ it("should break Chinese sentences", () => {
+ const text = `δΈ­ε›½δ½ ε₯½οΌθΏ™ζ˜―δΈ€δΈͺ桋试。
+ζˆ‘δ»¬ζ₯ηœ‹ηœ‹οΌšδΊΊζ°‘εΈΒ₯1234γ€ŒεΎˆθ΄΅γ€
+οΌˆζ‹¬ε·οΌ‰γ€ι€—ε·οΌŒε₯号。空格 ζ’θ‘Œγ€€ε…¨θ§’η¬¦ε·β€¦β€”`;
+
+ const maxWidth1 = 80;
+ const res1 = wrapText(text, font, maxWidth1);
+ expect(res1).toBe(`δΈ­ε›½δ½ ε₯½οΌθΏ™ζ˜―δΈ€\nδΈͺ桋试。
+ζˆ‘δ»¬ζ₯ηœ‹ηœ‹οΌšδΊΊζ°‘\n币Β₯1234γ€ŒεΎˆ\n贡」
+οΌˆζ‹¬ε·οΌ‰γ€ι€—ε·οΌŒ\nε₯号。空格 捒葌\n全角符号…—`);
+
+ const maxWidth2 = 50;
+ const res2 = wrapText(text, font, maxWidth2);
+ expect(res2).toBe(`δΈ­ε›½δ½ ε₯½οΌ\nθΏ™ζ˜―δΈ€δΈͺζ΅‹\n试。
+ζˆ‘δ»¬ζ₯ηœ‹\nηœ‹οΌšδΊΊζ°‘εΈ\nΒ₯1234\nγ€ŒεΎˆθ΄΅γ€
+οΌˆζ‹¬ε·οΌ‰γ€\nι€—ε·οΌŒε₯\n号。空格\nζ’θ‘Œγ€€ε…¨θ§’\n符号…—`);
+ });
+
+ it("should break Japanese sentences", () => {
+ const text = `ζ—₯ζœ¬γ“γ‚“γ«γ‘γ―οΌγ“γ‚Œγ―γƒ†γ‚Ήγƒˆγ§γ™γ€‚
+ θ¦‹γ¦γΏγΎγ—γ‚‡γ†οΌšε††οΏ₯1234γ€Œι«˜γ„γ€
+ οΌˆζ‹¬εΌ§οΌ‰γ€θͺ­η‚Ήγ€ε₯点。
+ η©Ίη™½ ζ”Ήθ‘Œγ€€ε…¨θ§’θ¨˜ε·β€¦γƒΌ`;
+
+ const maxWidth1 = 80;
+ const res1 = wrapText(text, font, maxWidth1);
+ expect(res1).toBe(`ζ—₯ζœ¬γ“γ‚“γ«γ‘γ―οΌ\nγ“γ‚Œγ―γƒ†γ‚Ήγƒˆγ§\nす。
+ 見てみましょ\nγ†οΌšε††οΏ₯1234\nγ€Œι«˜γ„γ€
+ οΌˆζ‹¬εΌ§οΌ‰γ€θͺ­\n点、ε₯点。
+ η©Ίη™½ ζ”Ήθ‘Œ\nε…¨θ§’θ¨˜ε·β€¦γƒΌ`);
+
+ const maxWidth2 = 50;
+ const res2 = wrapText(text, font, maxWidth2);
+ expect(res2).toBe(`ζ—₯ζœ¬γ“γ‚“γ«\nγ‘γ―οΌγ“γ‚Œ\nγ―γƒ†γ‚Ήγƒˆγ§\nす。
+ 見てみ\nγΎγ—γ‚‡γ†οΌš\n円\nοΏ₯1234\nγ€Œι«˜γ„γ€
+ οΌˆζ‹¬\n弧)、θͺ­\n点、ε₯点。
+ η©Ίη™½\nζ”Ήθ‘Œγ€€ε…¨θ§’\nθ¨˜ε·β€¦γƒΌ`);
+ });
+
+ it("should break Korean sentences", () => {
+ const text = `ν•œκ΅­ μ•ˆλ…•ν•˜μ„Έμš”! 이것은 ν…ŒμŠ€νŠΈμž…λ‹ˆλ‹€.
+우리 보자: 원화₩1234γ€ŒλΉ„μ‹Έλ‹€γ€
+(κ΄„ν˜Έ), μ‰Όν‘œ, λ§ˆμΉ¨ν‘œ.
+곡백 μ€„λ°”κΏˆγ€€μ „κ°κΈ°ν˜Έβ€¦β€”`;
+
+ const maxWidth1 = 80;
+ const res1 = wrapText(text, font, maxWidth1);
+ expect(res1).toBe(`ν•œκ΅­ μ•ˆλ…•ν•˜μ„Έ\nμš”! 이것은 ν…Œ\nμŠ€νŠΈμž…λ‹ˆλ‹€.
+우리 보자: 원\nν™”β‚©1234γ€ŒλΉ„\n싸닀」
+(κ΄„ν˜Έ), μ‰Ό\nν‘œ, λ§ˆμΉ¨ν‘œ.
+곡백 μ€„λ°”κΏˆγ€€μ „\nκ°κΈ°ν˜Έβ€¦β€”`);
+
+ const maxWidth2 = 60;
+ const res2 = wrapText(text, font, maxWidth2);
+ expect(res2).toBe(`ν•œκ΅­ μ•ˆλ…•ν•˜\nμ„Έμš”! 이것\n은 ν…ŒμŠ€νŠΈμž…\nλ‹ˆλ‹€.
+우리 보자:\n원화\nβ‚©1234\nγ€ŒλΉ„μ‹Έλ‹€γ€
+(κ΄„ν˜Έ),\nμ‰Όν‘œ, 마침\nν‘œ.
+곡백 μ€„λ°”κΏˆ\nμ „κ°κΈ°ν˜Έβ€¦β€”`);
+ });
+ });
+
+  describe("When text contains leading whitespaces", () => {
+    // Shared input: whitespace (including a tab) in front of two words.
+    const indented = " \t Hello world";
+
+    it("should preserve leading whitespaces", () => {
+      expect(wrapText(indented, font, 120)).toBe(" \t Hello\nworld");
+    });
+
+    it("should break and collapse leading whitespaces when line breaks", () => {
+      // The leading whitespace is expected to become an empty first line.
+      expect(wrapText(indented, font, 60)).toBe("\nHello\nworld");
+    });
+
+    it("should break and collapse leading whitespaces whe words break", () => {
+      // Same empty first line, and the words themselves are split as well.
+      expect(wrapText(indented, font, 30)).toBe("\nHel\nlo\nwor\nld");
+    });
+  });
+
+  describe("When text contains trailing whitespaces", () => {
+    // Input shared by the three "Hippopotomonstro..." cases below.
+    const longWordThenMarks = "Hippopotomonstrosesquippedaliophobia ??????";
+
+    it("shouldn't add new lines for trailing spaces", () => {
+      const sentence = "Hello whats up ";
+      expect(wrapText(sentence, font, 190)).toBe(sentence);
+    });
+
+    it("should ignore trailing whitespaces when line breaks", () => {
+      expect(wrapText(longWordThenMarks, font, 400)).toBe(
+        "Hippopotomonstrosesquippedaliophobia\n??????",
+      );
+    });
+
+    it("should not ignore trailing whitespaces when word breaks", () => {
+      expect(wrapText(longWordThenMarks, font, 300)).toBe(
+        "Hippopotomonstrosesquippedalio\nphobia ??????",
+      );
+    });
+
+    it("should ignore trailing whitespaces when word breaks and line breaks", () => {
+      expect(wrapText(longWordThenMarks, font, 180)).toBe(
+        "Hippopotomonstrose\nsquippedaliophobia\n??????",
+      );
+    });
+  });
+
+  describe("When text doesn't contain new lines", () => {
+    const sentence = "Hello whats up";
+
+    // Table of container widths and the wrapped output expected for each.
+    const cases = [
+      {
+        desc: "break all words when width of each word is less than container width",
+        width: 70,
+        res: "Hello\nwhats\nup",
+      },
+      {
+        desc: "break all characters when width of each character is less than container width",
+        width: 15,
+        res: "H\ne\nl\nl\no\nw\nh\na\nt\ns\nu\np",
+      },
+      {
+        desc: "break words as per the width",
+        width: 130,
+        res: "Hello whats\nup",
+      },
+      {
+        desc: "fit the container",
+        width: 240,
+        res: "Hello whats up",
+      },
+      {
+        desc: "push the word if its equal to max width",
+        width: 50,
+        res: "Hello\nwhats\nup",
+      },
+    ];
+
+    // Registers one test per table row, in table order.
+    for (const { desc, width, res } of cases) {
+      it(`should ${desc}`, () => {
+        expect(wrapText(sentence, font, width)).toEqual(res);
+      });
+    }
+  });
+
+  describe("When text contain new lines", () => {
+    const multiline = "Hello\n whats up";
+
+    // Table of container widths and the wrapped output expected for each.
+    const cases = [
+      {
+        desc: "break all words when width of each word is less than container width",
+        width: 70,
+        res: "Hello\n whats\nup",
+      },
+      {
+        desc: "break all characters when width of each character is less than container width",
+        width: 15,
+        res: "H\ne\nl\nl\no\n\nw\nh\na\nt\ns\nu\np",
+      },
+      {
+        desc: "break words as per the width",
+        width: 140,
+        res: "Hello\n whats up",
+      },
+    ];
+
+    // Registers one test per table row, in table order.
+    for (const { desc, width, res } of cases) {
+      it(`should respect new lines and ${desc}`, () => {
+        expect(wrapText(multiline, font, width)).toEqual(res);
+      });
+    }
+  });
+
+  describe("When text is long", () => {
+    const longText =
+      "hellolongtextthisiswhatsupwithyouIamtypingggggandtypinggg break it now";
+
+    // Table of container widths and the wrapped output expected for each.
+    const cases = [
+      {
+        desc: "fit characters of long string as per container width",
+        width: 160,
+        res: "hellolongtextthi\nsiswhatsupwithyo\nuIamtypingggggan\ndtypinggg break\nit now",
+      },
+      {
+        desc: "fit characters of long string as per container width and break words as per the width",
+        width: 120,
+        res: "hellolongtex\ntthisiswhats\nupwithyouIam\ntypingggggan\ndtypinggg\nbreak it now",
+      },
+      {
+        desc: "fit the long text when container width is greater than text length and move the rest to next line",
+        width: 590,
+        res: "hellolongtextthisiswhatsupwithyouIamtypingggggandtypinggg\nbreak it now",
+      },
+    ];
+
+    // Registers one test per table row, in table order.
+    for (const { desc, width, res } of cases) {
+      it(`should ${desc}`, () => {
+        expect(wrapText(longText, font, width)).toEqual(res);
+      });
+    }
+  });
+
+ describe("Test parseTokens", () => {
+    it("should tokenize latin", () => {
+      // Plain sentence: every word and every separating space is its own token.
+      const plain = "Excalidraw is a virtual collaborative whiteboard";
+      expect(parseTokens(plain)).toEqual([
+        "Excalidraw", " ", "is", " ", "a", " ",
+        "virtual", " ", "collaborative", " ", "whiteboard",
+      ]);
+
+      // Hyphenated sentence: a hyphen stays attached to the text in front of
+      // it and ends the token ("non-" + "profit", "range-" + "of").
+      const hyphenated =
+        "Wikipedia is hosted by Wikimedia- Foundation, a non-profit organization that also hosts a range-of other projects";
+      expect(parseTokens(hyphenated)).toEqual([
+        "Wikipedia", " ", "is", " ", "hosted", " ", "by", " ",
+        "Wikimedia-", " ", "Foundation,", " ", "a", " ",
+        "non-", "profit", " ", "organization", " ",
+        "that", " ", "also", " ", "hosts", " ", "a", " ",
+        "range-", "of", " ", "other", " ", "projects",
+      ]);
+    });
+
+    it("should not tokenize number", () => {
+      // A formatted number is expected back as one single token.
+      expect(parseTokens("99,100.99")).toEqual(["99,100.99"]);
+    });
+
+ it("should tokenize joined emojis", () => {
+ const text = `πŸ˜¬πŸŒπŸ—ΊπŸ”₯β˜‚οΈπŸ‘©πŸ½β€πŸ¦°πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦πŸ‘©πŸΎβ€πŸ”¬πŸ³οΈβ€πŸŒˆπŸ§”β€β™€οΈπŸ§‘β€πŸ€β€πŸ§‘πŸ™…πŸ½β€β™‚οΈβœ…0οΈβƒ£πŸ‡¨πŸ‡ΏπŸ¦…`;
+ const tokens = parseTokens(text);
+
+ expect(tokens).toEqual([
+ "😬",
+ "🌍",
+ "πŸ—Ί",
+ "πŸ”₯",
+ "β˜‚οΈ",
+ "πŸ‘©πŸ½β€πŸ¦°",
+ "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦",
+ "πŸ‘©πŸΎβ€πŸ”¬",
+ "πŸ³οΈβ€πŸŒˆ",
+ "πŸ§”β€β™€οΈ",
+ "πŸ§‘β€πŸ€β€πŸ§‘",
+ "πŸ™…πŸ½β€β™‚οΈ",
+ "βœ…",
+ "0️⃣",
+ "πŸ‡¨πŸ‡Ώ",
+ "πŸ¦…",
+ ]);
+ });
+
+ it("should tokenize emojis mixed with mixed text", () => {
+ const text = `😬a🌍bπŸ—ΊcπŸ”₯dβ˜‚οΈγ€ŠπŸ‘©πŸ½β€πŸ¦°γ€‹πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦εΎ·πŸ‘©πŸΎβ€πŸ”¬γ“πŸ³οΈβ€πŸŒˆμ•ˆπŸ§”β€β™€οΈgπŸ§‘β€πŸ€β€πŸ§‘hπŸ™…πŸ½β€β™‚οΈeβœ…f0️⃣gπŸ‡¨πŸ‡Ώ10πŸ¦…#hash`;
+ const tokens = parseTokens(text);
+
+ expect(tokens).toEqual([
+ "😬",
+ "a",
+ "🌍",
+ "b",
+ "πŸ—Ί",
+ "c",
+ "πŸ”₯",
+ "d",
+ "β˜‚οΈ",
+ "γ€Š",
+ "πŸ‘©πŸ½β€πŸ¦°",
+ "》",
+ "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦",
+ "εΎ·",
+ "πŸ‘©πŸΎβ€πŸ”¬",
+ "こ",
+ "πŸ³οΈβ€πŸŒˆ",
+ "μ•ˆ",
+ "πŸ§”β€β™€οΈ",
+ "g",
+ "πŸ§‘β€πŸ€β€πŸ§‘",
+ "h",
+ "πŸ™…πŸ½β€β™‚οΈ",
+ "e",
+ "βœ…",
+ "f0️⃣g", // bummer, but ok, as we traded kecaps not breaking (less common) for hash and numbers not breaking (more common)
+ "πŸ‡¨πŸ‡Ώ",
+ "10", // nice! do not break the number, as it's by default matched by \p{Emoji}
+ "πŸ¦…",
+ "#hash", // nice! do not break the hash, as it's by default matched by \p{Emoji}
+ ]);
+ });
+
+ it("should tokenize decomposed chars into their composed variants", () => {
+ // each input character is in a decomposed form
+ const text = "cΜŒγ¦γ‚™aΜˆγ²γ‚šΞ΅Μα„ƒα…‘ΠΈΜ†α„’α…‘α†«";
+ expect(text.normalize("NFC").length).toEqual(8);
+ expect(text).toEqual(text.normalize("NFD"));
+
+ const tokens = parseTokens(text);
+ expect(tokens.length).toEqual(8);
+ expect(tokens).toEqual(["č", "で", "Γ€", "ぴ", "Ξ­", "λ‹€", "ΠΉ", "ν•œ"]);
+ });
+
+ it("should tokenize artificial CJK", () => {
+ const text = `γ€Šι“εΎ·ηΆ“γ€‹ι†«-ι†«γ“γ‚“γ«γ‘γ―δΈ–η•ŒοΌμ•ˆλ…•ν•˜μ„Έμš”μ„Έκ³„οΌ›μš”γ€,λ‹€.λ‹€...원/달(((λ‹€)))[[1]]γ€š({((ν•œ))>)γ€›(γ€ŒγŸγ€)γŸβ€¦[Hello] \tγ€€WorldοΌŸγƒ‹γƒ₯ーヨーク・οΏ₯3700.55す。090-1234-5678οΏ₯1,000γ€œοΌ„5,000γ€Œη΄ ζ™΄γ‚‰γ—γ„οΌγ€γ€”ι‡θ¦γ€•οΌƒοΌ‘οΌšTaro君30οΌ…γ―γ€οΌˆγŸγͺγ°γŸοΌ‰γ€°οΏ₯110Β±οΏ₯570で20β„ƒγ€œ9:30γ€œ10:00【一η•ͺ】`;
+ // [
+ // 'γ€Šι“', 'εΎ·', '碓》', '醫-',
+ // '醫', 'こ', 'γ‚“', 'に',
+ // 'け', 'は', 'δΈ–', 'η•ŒοΌ',
+ // 'μ•ˆ', 'λ…•', 'ν•˜', 'μ„Έ',
+ // 'μš”', 'μ„Έ', '계;', 'μš”γ€,',
+ // 'λ‹€.', 'λ‹€...', '원/', '달',
+ // '(((λ‹€)))', '[[1]]', 'γ€š({((ν•œ))>)γ€›', '(γ€ŒγŸγ€)',
+ // 'γŸβ€¦', '[Hello]', ' ', '\t',
+ // 'γ€€', 'World?', 'ニ', 'γƒ₯',
+ // 'γƒΌ', 'ヨ', 'γƒΌ', 'ク・',
+ // 'οΏ₯3700.55', 'す。', '090-', '1234-',
+ // '5678', 'οΏ₯1,000γ€œ', 'οΌ„5,000', 'γ€Œη΄ ',
+ // 'ζ™΄', 'ら', 'し', 'い!」',
+ // '〔重', '要〕', 'οΌƒ', 'οΌ‘οΌš',
+ // 'Taro', '君', '30οΌ…', 'は、',
+ // '(た', 'γͺ', 'ば', 'γŸοΌ‰',
+ // 'γ€°', 'οΏ₯110Β±', 'οΏ₯570', 'で',
+ // '20β„ƒγ€œ', '9:30γ€œ', '10:00', '【一',
+ // 'η•ͺ】'
+ // ]
+ const tokens = parseTokens(text);
+
+ // Latin
+ expect(tokens).toContain("[[1]]");
+ expect(tokens).toContain("[Hello]");
+ expect(tokens).toContain("World?");
+ expect(tokens).toContain("Taro");
+
+ // Chinese
+ expect(tokens).toContain("γ€Šι“");
+ expect(tokens).toContain("εΎ·");
+ expect(tokens).toContain("碓》");
+ expect(tokens).toContain("醫-");
+ expect(tokens).toContain("醫");
+
+ // Japanese
+ expect(tokens).toContain("こ");
+ expect(tokens).toContain("γ‚“");
+ expect(tokens).toContain("に");
+ expect(tokens).toContain("け");
+ expect(tokens).toContain("は");
+ expect(tokens).toContain("δΈ–");
+ expect(tokens).toContain("ク・");
+ expect(tokens).toContain("η•ŒοΌ");
+ expect(tokens).toContain("γŸβ€¦");
+ expect(tokens).toContain("す。");
+ expect(tokens).toContain("γƒ₯");
+ expect(tokens).toContain("γ€Œη΄ ");
+ expect(tokens).toContain("ζ™΄");
+ expect(tokens).toContain("ら");
+ expect(tokens).toContain("し");
+ expect(tokens).toContain("い!」");
+ expect(tokens).toContain("君");
+ expect(tokens).toContain("は、");
+ expect(tokens).toContain("(た");
+ expect(tokens).toContain("γͺ");
+ expect(tokens).toContain("ば");
+ expect(tokens).toContain("γŸοΌ‰");
+ expect(tokens).toContain("で");
+ expect(tokens).toContain("【一");
+ expect(tokens).toContain("η•ͺ】");
+
+ // Check for Korean
+ expect(tokens).toContain("μ•ˆ");
+ expect(tokens).toContain("λ…•");
+ expect(tokens).toContain("ν•˜");
+ expect(tokens).toContain("μ„Έ");
+ expect(tokens).toContain("μš”");
+ expect(tokens).toContain("μ„Έ");
+ expect(tokens).toContain("계;");
+ expect(tokens).toContain("μš”γ€,");
+ expect(tokens).toContain("λ‹€.");
+ expect(tokens).toContain("λ‹€...");
+ expect(tokens).toContain("원/");
+ expect(tokens).toContain("달");
+ expect(tokens).toContain("(((λ‹€)))");
+ expect(tokens).toContain("γ€š({((ν•œ))>)γ€›");
+ expect(tokens).toContain("(γ€ŒγŸγ€)");
+
+ // Numbers and units
+ expect(tokens).toContain("οΏ₯3700.55");
+ expect(tokens).toContain("090-");
+ expect(tokens).toContain("1234-");
+ expect(tokens).toContain("5678");
+ expect(tokens).toContain("οΏ₯1,000γ€œ");
+ expect(tokens).toContain("οΌ„5,000");
+ expect(tokens).toContain("οΌ‘οΌš");
+ expect(tokens).toContain("30οΌ…");
+ expect(tokens).toContain("οΏ₯110Β±");
+ expect(tokens).toContain("20β„ƒγ€œ");
+ expect(tokens).toContain("9:30γ€œ");
+ expect(tokens).toContain("10:00");
+
+ // Punctuation and symbols
+ expect(tokens).toContain(" ");
+ expect(tokens).toContain("\t");
+ expect(tokens).toContain("γ€€");
+ expect(tokens).toContain("ニ");
+ expect(tokens).toContain("γƒΌ");
+ expect(tokens).toContain("ヨ");
+ expect(tokens).toContain("γ€°");
+ expect(tokens).toContain("οΌƒ");
+ });
+ });
+});