🔰Unicode
🚧 under construction -> tidy this page
Last updated
Was this helpful?
🚧 under construction -> tidy this page
Last updated
Was this helpful?
Was this helpful?
JS ⟩ value ⟩ primitive ⟩ String ⟩ Unicode
every Unicode character is assigned a code point.
code points are divided into 17 code planes.
one or more code points can be combined into a single grapheme cluster.
character encoding transforms code points into code units.
most JavaScript engines use UTF-16 encoding.
Letter L
:
lowercase Ll
,
modifier Lm
,
titlecase Lt
,
uppercase Lu
,
other Lo
.
Number N
:
decimal digit Nd
,
letter number Nl
,
Punctuation P
:
connector Pc
,
dash Pd
,
Mark M
(accents etc):
spacing combining Mc
,
enclosing Me
,
Symbol S
:
currency Sc
,
modifier Sk
,
Separator Z
:
line Zl
,
paragraph Zp
,
Other C
:
control Cc
,
format Cf
,
Enable the "/u" flag to support Unicode in regular expressions.
other No
.
initial quote Pi
,
final quote Pf
,
open Ps
,
close Pe
,
other Po
.
non-spacing Mn
.
math Sm
,
other So
.
space Zs
.
not assigned Cn
,
private use Co
,
surrogate Cs
.
const { log } = console;
const convert = require('./Converter.js');

// Demo: why regular expressions need the /u flag for characters outside
// the Basic Multilingual Plane, plus a few \p{…} property escapes.
// Every entry is evaluated in order and then printed on its own line.
const results = [
    // --- without /u, quantifiers apply to a single UTF-16 code unit ---
    convert.stringToCodeUnits("🍎"),          // [ 55356=a, 57166=b ]
    /🍎{3}/.test("🍎🍎🍎"),                    // false❗️
    //   treat 🍎 as the two code units "ab":
    //   🍎{3} is read as ab{3} = abbb, which is not ababab = 🍎🍎🍎

    // --- without /u, "." matches exactly one code unit ---
    convert.stringToCodeUnits("🌹"),          // [ 55356=a, 57145=c ]
    /<.>/.test("<🌹>"),                       // false❗️
    //   treat 🌹 as the two code units "ac":
    //   <.> cannot match <ac>

    // --- with /u, "." matches one whole code point ---
    /<.>/u.test("<🌹>"),                      // true ⭐️  (✅ enable the /u flag)

    // --- find Han (Chinese) characters ---
    `Hello Привет 你好`.match(/\p{sc=Han}/gu), // [ '你', '好' ]

    // --- Script property ---
    /\p{Script=Greek}/u.test("α"),            // → true
    /\p{Script=Arabic}/u.test("α"),           // → false

    // --- Alphabetic property ---
    /\p{Alphabetic}/u.test("α"),              // → true
    /\p{Alphabetic}/u.test("!"),              // → false
    /\p{Alphabetic}/u.test("漢"),             // → true
];

for (const result of results) {
    log(result);
}
const {log} = console;

/*
Demo: matching "word" characters and punctuation in multilingual text with
Unicode property escapes (these require the /u flag).

Properties used:
- Alpha  (Alphabetic)            : letters
- M      (Mark)                  : accents
- Nd     (Decimal_Number)        : digits
- Pc     (Connector_Punctuation) : underscore '_' and similar characters
- Po     (Other_Punctuation)     : punctuation that is not a connector, dash,
                                   bracket or quote, e.g. "," "." "、" "。"
- P      (Punctuation)           : every punctuation category (Pc, Pd, Ps, Pe, Pi, Pf, Po)
- Join_C (Join_Control)          : (200c, 200d) used in ligatures, e.g. in Arabic
*/

// multi-language "word" character (like "\w", but in a Unicode sense)
const char = /[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]/gu;
// "other" punctuation only
const Po = /\p{Po}/gu;
// any punctuation
const P = /\p{P}/gu;

const str = `Hi、你好,(+US$12.34-)。`;

// String.prototype.match resets lastIndex for /g regexes, so the shared
// regex objects above are safe to use here.
[
    str.match(char), // ["H","i","你","好","U","S","1","2","3","4"]
    str.match(Po),   // ["、",",",".","。"] ⭐️
    str.match(P),    // ["、",",","(",".","-",")","。"]
].forEach(x => log(x));
\p{Script=Han}
\p{sc=Han}
漢字
\p{Letter}
\p{L}
a letter in any language.
\p{Number}
\p{N}
a numeric character of any kind (decimal digit Nd, letter number Nl, or other number No).
\p{Po}
punctuation (others)
“,” 就屬於此類 ⭐️