🔰Unicode
🚧 under construction -> tidy this page
JS ⟩ value ⟩ primitive ⟩ String ⟩ Unicode
every Unicode character is assigned a code point.
code points are divided into 17 code planes.
one or more code points can be combined into a single grapheme cluster.
character encoding transforms code points into code units.
most JavaScript engines use UTF-16 encoding.
const { log } = console;
const convert = require('./Converter.js');

// Demo: how regular expressions treat characters outside the Basic
// Multilingual Plane, and how the /u flag and \p{...} property escapes help.
// Every array entry is evaluated and then logged in order; the trailing
// comment on each entry is the expected output.
[
  // --- Without /u, a regex operates on UTF-16 code units ---
  convert.stringToCodeUnits("🍎"), // [ 55356=a, 57166=b ]
  /🍎{3}/.test("🍎🍎🍎"), // false (surprising)
  // assume 🍎 = ab (2 code units)
  // 🍎{3} = ab{3} = abbb ≠ ababab = 🍎🍎🍎
  convert.stringToCodeUnits("🌹"), // [ 55356=a, 57145=c ]
  /<.>/.test("<🌹>"), // false (surprising)
  // assume 🌹 = ac (2 code units)
  // <.> != <ac>  ("." matches only one code unit)
  /<.>/u.test("<🌹>"), // true
  // fix: enable the /u flag so the regex works on whole code points

  // --- Unicode property escapes (require the /u flag) ---
  // extract the Han characters (漢字)
  `Hello Привет 你好`.match(/\p{sc=Han}/gu), // [ '你', '好' ]
  // Script
  /\p{Script=Greek}/u.test("α"), // → true
  /\p{Script=Arabic}/u.test("α"), // → false
  // Alphabetic
  /\p{Alphabetic}/u.test("α"), // → true
  /\p{Alphabetic}/u.test("!"), // → false
  /\p{Alphabetic}/u.test("漢"), // → true
].forEach(x => log(x));
main categories and subcategories
Letter
L
:lowercase
Ll
,modifier
Lm
,titlecase
Lt
,uppercase
Lu
,other
Lo
.
Number
N
:decimal digit
Nd
,letter number
Nl
,other
No
.
Punctuation
P
:connector
Pc
,dash
Pd
,initial quote
Pi
,final quote
Pf
,open
Ps
,close
Pe
,other
Po
.
Mark
M
(accents etc):spacing combining
Mc
,enclosing
Me
,non-spacing
Mn
.
Symbol
S
:currency
Sc
,modifier
Sk
,math
Sm
,other
So
.
Separator
Z
:line
Zl
,paragraph
Zp
,space
Zs
.
Other
C
:control
Cc
,format
Cf
,not assigned
Cn
,private use
Co
,surrogate
Cs
.
Last updated
Was this helpful?