-- Module:Unicode data
-- From Vigyanwiki
-- Documentation for this module may be created at Module:Unicode data/doc
-- Table of functions exposed by this module; it is returned at the end of the file.
local export = {}
-- Local alias for math.floor, used for the integer divisions in this file.
local floor = math.floor
-- The following leads, vowels, and trails come from here:
-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
--
-- All three tables are indexed from 0, so the length operator reports one
-- element fewer than the table holds; the counts below add that element back.
local hangul_leads = {
	[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS",
	"", "J", "JJ", "C", "K", "T", "P", "H"
}
-- not needed:
-- hangul_leads.length = #hangul_leads + 1
local hangul_vowels = {
	[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
	"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
	"I"
}
-- Number of vowels (21). Declared local so that it does not leak into the
-- global environment.
local hangul_vowel_count = #hangul_vowels + 1
local hangul_trails = {
	[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
	"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K",
	"T", "P", "H"
}
-- Number of trails, including the empty one at index 0 (28).
local hangul_trail_count = #hangul_trails + 1
-- Number of syllables that share one lead: vowels x trails (21 * 28 = 588).
local hangul_coda_count = hangul_vowel_count * hangul_trail_count
-- Codepoint ranges whose names are generated instead of being read from the
-- name data modules. Each entry is { first, last, hook }: hook is either a
-- string.format pattern that receives the codepoint, or a function called
-- with the codepoint.
-- Entries must be sorted by ascending codepoint and must not overlap, because
-- export.lookup_name stops scanning at the first entry that starts above the
-- codepoint being looked up.
local name_hooks = {
	{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters
	{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{ 0x3400, 0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{ 0x4E00, 0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
	{ 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables
		-- Split the syllable index into lead, vowel and trail indices.
		local syllable_index = codepoint - 0xAC00
		return ("HANGUL SYLLABLE %s%s%s"):format(
			hangul_leads[floor(syllable_index / hangul_coda_count)],
			hangul_vowels[floor((syllable_index % hangul_coda_count) / hangul_trail_count)],
			hangul_trails[syllable_index % hangul_trail_count]
		)
	end },
	-- Non Private Use High Surrogate, Private Use High Surrogate, Low Surrogate
	{ 0xD800, 0xDFFF, "<surrogate-%04X>" },
	{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use
	{ 0x17000, 0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
	{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
	{ 0x20000, 0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
	{ 0x2A700, 0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740. A start of 0x2A740 would overlap
	-- Extension C and give the reserved codepoints U+2B735..U+2B73F a name.
	{ 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
	{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
	{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F
	-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" },
	{ 0xF0000, 0xFFFFD, "<private-use-%05X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%06X>" } -- Plane 16 Private Use
}
-- Most recently matched "name hook" entry; assigned by export.lookup_name.
local name_range_cache
-- Produce a name from a name hook's third field: a function hook is called
-- with the codepoint, anything else is used as a format pattern for it.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
--
-- Return the name of a codepoint, or a bracketed label for codepoints that
-- have no name.
-- @param codepoint  number in the range 0..0x10FFFF
-- @return  string: a label of the form "<noncharacter-XXXX>", a name produced
--          by a name hook, the value stored in the matching
--          "Module:Unicode data/names/XXX" page, or "<reserved-XXXX>"
-- Raises an error when codepoint is not a number or is out of range.
function export.lookup_name(codepoint)
	-- Fail with a readable message instead of "attempt to compare nil with
	-- number" when the caller passes something that is not a number.
	if type(codepoint) ~= "number" then
		error(("Codepoint must be a number, got %s"):format(type(codepoint)))
	end
	if codepoint < 0 or 0x10FFFF < codepoint then
		error(("Codepoint %04X out of range"):format(codepoint))
	end
	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are Unassigned
	-- (Cn) and specifically noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or codepoint % 0x10000 >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end
	if name_range_cache -- Check if previously used "name hook" applies to this codepoint.
			and codepoint >= name_range_cache[1]
			and codepoint <= name_range_cache[2] then
		return generate_name(name_range_cache[3], codepoint)
	end
	for _, item in ipairs(name_hooks) do
		if codepoint < item[1] then
			-- Hooks are sorted, so no later hook can contain the codepoint.
			break
		elseif codepoint <= item[2] then
			-- Save "name hook" in case another character
			-- from the same range will be looked up in the same module invocation.
			name_range_cache = item
			return generate_name(item[3], codepoint)
		end
	end
	-- Each names page is selected by the codepoint divided by 0x1000. The
	-- quotient is floored explicitly so that %X always receives an integer
	-- (Lua 5.3 and later reject a fractional argument).
	-- A page that fails to load is treated the same as a page without an entry.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(floor(codepoint / 0x1000)))
	if success and data[codepoint] then
		return data[codepoint]
	-- Unassigned (Cn) consists of noncharacters and reserved characters.
	-- The character has been established not to be a noncharacter,
	-- and if it were assigned, its name would already have been retrieved,
	-- so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end
-- Return the entry stored for a codepoint in the matching
-- "Module:Unicode data/images/XXX" page.
-- @param codepoint  number
-- @return  the stored value, or nil when the page has no entry for the
--          codepoint; nothing at all when the page cannot be loaded
function export.lookup_image(codepoint)
	-- The page is selected by the codepoint divided by 0x1000. The quotient
	-- is floored explicitly so that %X always receives an integer
	-- (Lua 5.3 and later reject a fractional argument).
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/images/%03X'):format(floor(codepoint / 0x1000))
	)
	if success then
		return data[codepoint]
	end
end
-- Template entry point: look up the name of the codepoint given as the first
-- argument of the invocation, falling back to the first argument of the
-- parent frame.
-- @param frame  frame object supplying args and getParent()
-- @return  the name with every "<" replaced by "&lt;" (presumably so that
--          labels such as "<control-0000>" are not read as tags in wikitext)
-- Raises an error when the argument cannot be converted to a number.
function export.template_lookup_name(frame)
	local codepoint = tonumber(frame.args[1] or frame:getParent().args[1])
	if not codepoint then
		error("template_lookup_name: the first argument must be a number")
	end
	local name = export.lookup_name(codepoint)
	-- gsub also returns the number of replacements; the parentheses drop it
	-- so that only the escaped name is returned.
	return (name:gsub("<", "&lt;"))
end
-- Names of the Unicode planes that have one, keyed by plane number
-- (codepoint divided by 0x10000). Planes that are not listed have no name
-- in this table.
-- Plane 14 is the Supplementary Special-purpose Plane; planes 15 and 16
-- (U+F0000.. and U+100000..) are the two supplementary private use areas.
local planes = {
	[ 0] = "Basic Multilingual Plane",
	[ 1] = "Supplementary Multilingual Plane",
	[ 2] = "Supplementary Ideographic Plane",
	[14] = "Supplementary Special-purpose Plane",
	[15] = "Supplementary Private Use Area-A",
	[16] = "Supplementary Private Use Area-B",
}
-- Contents of [[Module:Unicode data/blocks]], loaded on first use and shared
-- by the block functions in this file.
local blocks
-- Return a stateless iterator over the block list, for use in a generic
-- "for" loop.
-- Each step yields the index of the block followed by the fields of its
-- entry (elsewhere in this file field 1 is used as the block name and
-- fields 2 and 3 as the first and last codepoint).
function export.enum_blocks()
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	local function next_block(block_list, i)
		i = i + 1
		local data = block_list[i]
		if not data then
			return nil
		end
		-- Tables returned by mw.loadData are read-only proxies whose fields
		-- are reached through metamethods. unpack() reads raw fields and so
		-- would return nothing; copy the fields into a plain table first.
		local fields = {}
		for j, value in ipairs(data) do
			fields[j] = value
		end
		return i, unpack(fields)
	end
	return next_block, blocks, 0
end
-- Return the name of the plane containing a codepoint. Planes without an
-- entry in the planes table are reported as "Plane N".
function export.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Find the block that contains a codepoint and return field 1 of its entry.
-- The block list is bisected rather than scanned, so that high codepoints do
-- not require walking the whole table. Field 2 of an entry is compared as the
-- first codepoint of the block and field 3 as the last.
-- Raises an error when no block contains the codepoint.
function export.lookup_block(codepoint)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	local low = 1
	local high = blocks.length or #blocks
	while low <= high do
		local middle = floor((low + high) / 2)
		local entry = blocks[middle]
		if codepoint < entry[2] then
			-- The codepoint lies before this block: keep the lower half.
			high = middle - 1
		elseif codepoint <= entry[3] then
			return entry[1]
		else
			-- The codepoint lies after this block: keep the upper half.
			low = middle + 1
		end
	end
	error(("No block found for codepoint U+%04X."):format(codepoint))
end
-- Return fields 2 and 3 (first and last codepoint) of the block entry whose
-- field 1 equals name. The whole list is scanned, so if several entries
-- share the name the last one is used. Returns nothing when no entry matches.
function export.get_block_range(name)
	blocks = blocks or mw.loadData("Module:Unicode data/blocks")
	local match
	for _, entry in ipairs(blocks) do
		if entry[1] == name then
			match = entry
		end
	end
	if not match then
		return
	end
	return match[2], match[3]
end
-- Check a page name codepoint by codepoint.
-- Returns false as soon as a rejected or non-printable codepoint is found;
-- otherwise returns true only if at least one codepoint is not classified
-- as "space-separator".
function export.is_valid_pagename(pagename)
	-- Individual codepoints that are always rejected.
	local rejected = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}
	local seen_non_space = false
	for cp in mw.ustring.gcodepoint(pagename) do
		-- 0x2000..0x200A: spaces in General Punctuation block
		if rejected[cp] or (cp >= 0x2000 and cp <= 0x200A) then
			return false
		end
		local printable, category = export.is_printable(cp)
		if not printable then
			return false
		end
		if category ~= "space-separator" then
			seen_non_space = true
		end
	end
	return seen_non_space
end
-- Return the elements of a sequence as multiple values, starting at index
-- "from" (default 1). The elements are gathered with ipairs into a fresh
-- table, and that table is what gets unpacked.
local function manual_unpack(what, from)
	local start = from or 1
	local copied, count = {}, 0
	for index, value in ipairs(what) do
		if index >= start then
			count = count + 1
			copied[count] = value
		end
	end
	return unpack(copied)
end
-- Build a lookup function over a data set made of single codepoints and
-- codepoint ranges.
-- @param loader      function returning two tables: singles (codepoint ->
--                    value) and ranges (a list of { first, last, value... }
--                    entries in ascending codepoint order)
-- @param match_func  function(codepoint, value...) whose result becomes the
--                    result of the lookup
-- @param ...         default value(s) handed to match_func for a codepoint
--                    that falls in a gap before some range
-- @return  function(codepoint). The loader runs on the first call only.
--          Every range or gap that answers a lookup is remembered, so later
--          lookups in the same span are answered from the cache.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges
	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end
		local single_value = singles[codepoint]
		if single_value then
			return match_func(codepoint, single_value)
		end
		-- Spans that answered an earlier lookup.
		for _, range in ipairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end
		-- The scan below stops at the first range that starts above the
		-- codepoint, so the ranges must be visited in ascending order.
		-- ipairs guarantees that order; pairs does not.
		local lastlast = -1 -- last codepoint of the previous range
		for _, range in ipairs(ranges) do
			if codepoint < range[1] then
				-- In the gap between the previous range and this one:
				-- remember the whole gap with the default value(s).
				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
				return match_func(codepoint, unpack(dots))
			elseif codepoint <= range[2] then
				-- manual_unpack copies the fields of the loaded range entry
				-- through ipairs before unpacking them.
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				lastlast = range[2]
			end
		end
		-- Beyond the last range: match_func is called without a value.
		return match_func(codepoint)
	end
end
-- Look up a codepoint's combining class value in
-- [[Module:Unicode data/combining]] and report whether it is non-zero.
-- Zero is the default when the data module has no value for the codepoint.
-- In other words: true for a combining character, false otherwise.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
export.is_combining = memo_lookup(
	function ()
		local combining_data = mw.loadData('Module:Unicode data/combining')
		return combining_data.single, combining_data.ranges
	end,
	function (codepoint, combining_class)
		if combining_class and combining_class ~= 0 then
			return true
		end
		return false
	end,
	0
)
-- Return str with a dotted circle inserted in front of every character for
-- which export.is_combining is true.
function export.add_dotted_circle(str)
	local function prefix_if_combining(char)
		if export.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
		-- Returning nothing makes gsub keep the character unchanged.
	end
	local result = mw.ustring.gsub(str, ".", prefix_if_combining)
	return result
end
-- Look up the value stored for a codepoint in [[Module:Unicode data/control]].
-- Codepoints without a stored value are reported as "assigned".
local lookup_control = memo_lookup(
	function ()
		local control_data = mw.loadData('Module:Unicode data/control')
		return control_data.single, control_data.ranges
	end,
	function (codepoint, ccc)
		if ccc then
			return ccc
		end
		return "assigned"
	end,
	"assigned"
)
-- True unless lookup_control reports the codepoint as "unassigned".
function export.is_assigned(codepoint)
	local category = lookup_control(codepoint)
	return category ~= "unassigned"
end
-- Return two values: whether lookup_control reports the codepoint as
-- "assigned" or "space-separator", and the reported value itself.
function export.is_printable(codepoint)
	local category = lookup_control(codepoint)
	local printable = category == "assigned" or category == "space-separator"
	return printable, category
end
-- Return two values: whether lookup_control reports the codepoint as
-- "space-separator", and the reported value itself.
function export.is_whitespace(codepoint)
	local category = lookup_control(codepoint)
	local is_space = category == "space-separator"
	return is_space, category
end
-- Replacement page titles, keyed by codepoint. export.get_entry_title
-- returns these instead of the character itself.
local unsupported_title = {
	-- ASCII
	[0x0020] = "Unsupported titles/Space",
	[0x0023] = "Unsupported titles/Number sign",
	[0x002E] = "Unsupported titles/Full stop",
	[0x003A] = "Unsupported titles/Colon",
	[0x003C] = "Unsupported titles/Less than",
	[0x003E] = "Unsupported titles/Greater than",
	[0x005B] = "Unsupported titles/Left square bracket",
	[0x005D] = "Unsupported titles/Right square bracket",
	[0x005F] = "Unsupported titles/Low line",
	[0x007B] = "Unsupported titles/Left curly bracket",
	[0x007C] = "Unsupported titles/Vertical line",
	[0x007D] = "Unsupported titles/Right curly bracket",
	-- Beyond ASCII
	[0x1680] = "Unsupported titles/Ogham space",
	[0xFFFD] = "Unsupported titles/Replacement character",
}
-- Return the page title to use for a codepoint: the replacement title from
-- unsupported_title when there is one, otherwise the character itself if
-- lookup_control reports the codepoint as "assigned", otherwise nil.
function export.get_entry_title(codepoint)
	local replacement = unsupported_title[codepoint]
	if replacement then
		return replacement
	end
	if lookup_control(codepoint) == "assigned" then
		return mw.ustring.char(codepoint)
	end
	return nil
end
return export