🔰Unicode
🚧 under construction -> tidy this page
JS ⟩ value ⟩ primitive ⟩ String ⟩ Unicode
every Unicode character is assigned a code point.
code points are divided into 17 code planes.
one or more code points can be combined into a single grapheme cluster.
character encoding transforms code points into code units.
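most JavaScript engines use UTF-16 encoding, so a character outside the Basic Multilingual Plane (e.g. an emoji) is stored as two code units (a surrogate pair).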
const { log } = console;
const convert = require('./Converter.js');

// Demo: how regular expressions treat astral characters with and without
// the /u flag, followed by Unicode property escapes (\p{...}).
// Every entry below is evaluated in order and printed on its own line.
for (const outcome of [
  // --- Quantifier on an emoji, no /u flag ---
  // Treat 🍎 as the two code units "ab".
  convert.stringToCodeUnits("🍎"), // [ 55356=a, 57166=b ]
  // Without /u the quantifier binds to the last code unit only:
  // 🍎{3} = ab{3} = abbb ≠ ababab = 🍎🍎🍎
  /🍎{3}/.test("🍎🍎🍎"), // false❗️

  // --- The dot and an emoji ---
  // Treat 🌹 as the two code units "ac".
  convert.stringToCodeUnits("🌹"), // [ 55356=a, 57145=c ]
  // Without /u the dot consumes a single code unit, so <.> != <ac>
  /<.>/.test("<🌹>"), // false❗️
  // ✅ With the /u flag enabled the dot consumes the whole character.
  /<.>/u.test("<🌹>"), // true ⭐️

  // --- Unicode property escapes (require /u) ---
  // Search for Han characters.
  `Hello Привет 你好`.match(/\p{sc=Han}/gu), // [ '你', '好' ]

  // Script property
  /\p{Script=Greek}/u.test("α"), // → true
  /\p{Script=Arabic}/u.test("α"), // → false

  // Alphabetic property
  /\p{Alphabetic}/u.test("α"), // → true
  /\p{Alphabetic}/u.test("!"), // → false
  /\p{Alphabetic}/u.test("漢"), // → true
]) {
  log(outcome);
}
main categories and subcategories
Letter (L): lowercase (Ll), modifier (Lm), titlecase (Lt), uppercase (Lu), other (Lo).
Number (N): decimal digit (Nd), letter number (Nl), other (No).
Punctuation (P): connector (Pc), dash (Pd), initial quote (Pi), final quote (Pf), open (Ps), close (Pe), other (Po).
Mark (M; accents etc.): spacing combining (Mc), enclosing (Me), non-spacing (Mn).
Symbol (S): currency (Sc), modifier (Sk), math (Sm), other (So).
Separator (Z): line (Zl), paragraph (Zp), space (Zs).
Other (C): control (Cc), format (Cf), not assigned (Cn), private use (Co), surrogate (Cs).
Last updated
Was this helpful?