Module:Unicode data: Difference between revisions
From Vigyanwiki
Template>Erutuon (binary search in name_hooks: possibly more efficient) |
m (59 revisions imported from alpha:Module:Unicode_data) |
||
(41 intermediate revisions by 10 users not shown) | |||
Line 2: | Line 2: | ||
local floor = math.floor | local floor = math.floor | ||
local function errorf(level, ...) | |||
if type(level) == "number" then | |||
return error(string.format(...), level + 1) | |||
else -- level is actually the format string. | |||
return error(string.format(level, ...), 2) | |||
end | |||
end | |||
local function binary_range_search(codepoint, ranges) | |||
local low, mid, high | |||
low, high = 1, ranges.length or require "Module:TableTools".length(ranges) | |||
while low <= high do | |||
mid = floor((low + high) / 2) | |||
local range = ranges[mid] | |||
if codepoint < range[1] then | |||
high = mid - 1 | |||
elseif codepoint <= range[2] then | |||
return range, mid | |||
else | |||
low = mid + 1 | |||
end | |||
end | |||
return nil, mid | |||
end | |||
p.binary_range_search = binary_range_search | |||
--[[ | |||
local function linear_range_search(codepoint, ranges) | |||
for i, range in ipairs(ranges) do | |||
if range[1] <= codepoint and codepoint <= range[2] then | |||
return range | |||
end | |||
end | |||
end | |||
--]] | |||
-- Load a module by indexing "loader" with the name of the module minus the | |||
-- "Module:Unicode data/" part. For instance, loader.blocks returns | |||
-- [[Module:Unicode data/blocks]]. If a module cannot be loaded, false will be | |||
-- returned. | |||
local loader = setmetatable({}, { | |||
__index = function (self, key) | |||
local success, data = pcall(mw.loadData, "Module:Unicode data/" .. key) | |||
if not success then | |||
data = false | |||
end | |||
self[key] = data | |||
return data | |||
end | |||
}) | |||
-- For the algorithm used to generate Hangul Syllable names, | -- For the algorithm used to generate Hangul Syllable names, | ||
Line 7: | Line 58: | ||
-- Unicode Specification: | -- Unicode Specification: | ||
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | ||
local name_hooks = { | local name_hooks = { | ||
{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters | { 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters | ||
{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters | { 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters | ||
{ 0x3400, | { 0x3400, 0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A | ||
{ 0x4E00, | { 0x4E00, 0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph | ||
{ 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables | { 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables | ||
Hangul_data = | local Hangul_data = loader.Hangul | ||
local syllable_index = codepoint - 0xAC00 | local syllable_index = codepoint - 0xAC00 | ||
Line 28: | Line 77: | ||
{ 0xD800, 0xDFFF, "<surrogate-%04X>" }, | { 0xD800, 0xDFFF, "<surrogate-%04X>" }, | ||
{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use | { 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use | ||
{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | -- CJK Compatibility Ideographs | ||
{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | |||
{ 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | { 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
{ 0x17000, | { 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph | ||
{ 0x18800, | { 0x18800, 0x18AFF, function (codepoint) | ||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | ||
end }, | end }, | ||
{ 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement | |||
{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | { 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | ||
{ 0x20000, | { 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | ||
{ 0x2A700, | { 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | ||
{ | { 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D | ||
{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | { 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | ||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | ||
Line 45: | Line 96: | ||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | ||
end}, | end}, | ||
{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | |||
{ 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H | |||
{ 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | { 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | ||
{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use | { 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use | ||
Line 59: | Line 112: | ||
end | end | ||
end | end | ||
--[[ | |||
-- Checks that the code point is a number and in range. | |||
-- Does not check whether code point is an integer. | |||
-- Not used | |||
local function check_codepoint(funcName, argIdx, val) | |||
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number') | |||
if codepoint < 0 or 0x10FFFF < codepoint then | |||
errorf("Codepoint %04X out of range", codepoint) | |||
end | |||
end | |||
--]] | |||
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | -- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | ||
function p.lookup_name(codepoint) | function p.lookup_name(codepoint) | ||
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | |||
-- U+FDD0-U+FDEF and all | |||
-- (Cn) and specifically noncharacters: | -- (Cn) and specifically noncharacters: | ||
-- https://www.unicode.org/faq/private_use.html#nonchar4 | -- https://www.unicode.org/faq/private_use.html#nonchar4 | ||
Line 75: | Line 135: | ||
end | end | ||
if name_range_cache -- Check if previously used "name hook" applies to this | if name_range_cache -- Check if previously used "name hook" applies to this code point. | ||
and codepoint >= name_range_cache[1] | and codepoint >= name_range_cache[1] | ||
and codepoint <= name_range_cache[2] then | and codepoint <= name_range_cache[2] then | ||
Line 81: | Line 141: | ||
end | end | ||
local | local range = binary_range_search(codepoint, name_hooks) | ||
if range then | |||
name_range_cache = range | |||
return generate_name(range[3], codepoint) | |||
end | end | ||
local | local data = loader[('names/%03X'):format(codepoint / 0x1000)] | ||
if | if data and data[codepoint] then | ||
return data[codepoint] | return data[codepoint] | ||
Line 114: | Line 161: | ||
end | end | ||
--[[ | |||
-- No image data modules on Wikipedia yet. | |||
function p.lookup_image(codepoint) | function p.lookup_image(codepoint) | ||
local | local data = loader[('images/%03X'):format(codepoint / 0x1000)] | ||
if | if data then | ||
return data[codepoint] | return data[codepoint] | ||
end | end | ||
end | end | ||
--]] | |||
local planes = { | local planes = { | ||
Line 140: | Line 176: | ||
[ 1] = "Supplementary Multilingual Plane"; | [ 1] = "Supplementary Multilingual Plane"; | ||
[ 2] = "Supplementary Ideographic Plane"; | [ 2] = "Supplementary Ideographic Plane"; | ||
[ | [ 3] = "Tertiary Ideographic Plane"; | ||
[ | [14] = "Supplementary Special-purpose Plane"; | ||
[ | [15] = "Supplementary Private Use Area-A"; | ||
[16] = "Supplementary Private Use Area-B"; | |||
} | } | ||
Line 152: | Line 189: | ||
local data = blocks[i] | local data = blocks[i] | ||
if data then | if data then | ||
return i, | -- Unpack doesn't work on tables loaded with mw.loadData. | ||
return i, data[1], data[2], data[3] | |||
end | end | ||
end | end | ||
Line 158: | Line 196: | ||
-- An ipairs-type iterator generator for the list of blocks. | -- An ipairs-type iterator generator for the list of blocks. | ||
function p.enum_blocks() | function p.enum_blocks() | ||
blocks = | local blocks = loader.blocks | ||
return block_iter, blocks, 0 | return block_iter, blocks, 0 | ||
end | end | ||
Line 167: | Line 205: | ||
end | end | ||
function p.lookup_block(codepoint) | function p.lookup_block(codepoint) | ||
blocks = | local blocks = loader.blocks | ||
local | local range = binary_range_search(codepoint, blocks) | ||
if range then | |||
return range[3] | |||
else | |||
return "No Block" | |||
end | end | ||
end | end | ||
function p. | function p.get_block_info(name) | ||
for i, block in ipairs(loader.blocks) do | |||
for i, block in ipairs(blocks) do | |||
if block[3] == name then | if block[3] == name then | ||
return block | |||
end | end | ||
end | end | ||
end | end | ||
Line 232: | Line 254: | ||
local function manual_unpack(what, from) | local function manual_unpack(what, from) | ||
if what[from + 1] == nil then | |||
return what[from] | |||
end | |||
local result = {} | local result = {} | ||
from = from or 1 | from = from or 1 | ||
Line 240: | Line 266: | ||
end | end | ||
return unpack(result) | return unpack(result) | ||
end | |||
local function compare_ranges(range1, range2) | |||
return range1[1] < range2[1] | |||
end | end | ||
-- Creates a function to look up data in a module that contains "singles" (a | -- Creates a function to look up data in a module that contains "singles" (a | ||
-- | -- code point-to-data map) and "ranges" (an array containing arrays that contain | ||
-- the low and high | -- the low and high code points of a range and the data associated with that | ||
-- range). | -- range). | ||
-- "loader" loads and returns the "singles" and "ranges" tables. | |||
-- "match_func" is passed the code point and either the data or the "dots", and | |||
-- generates the final result of the function. | |||
-- The varargs ("dots") describes the default data to be returned if there wasn't | -- The varargs ("dots") describes the default data to be returned if there wasn't | ||
-- a match. | -- a match. | ||
Line 251: | Line 284: | ||
-- already been found to match, or a range whose data is the default if there | -- already been found to match, or a range whose data is the default if there | ||
-- was no match. | -- was no match. | ||
local function memo_lookup( | local function memo_lookup(data_module_subpage, match_func, ...) | ||
local dots = { ... } | local dots = { ... } | ||
local cache = {} | local cache = {} | ||
Line 258: | Line 291: | ||
return function (codepoint) | return function (codepoint) | ||
if not singles then | if not singles then | ||
singles, ranges = | local data_module = loader[data_module_subpage] | ||
singles, ranges = data_module.singles, data_module.ranges | |||
end | end | ||
Line 265: | Line 299: | ||
end | end | ||
local | local range = binary_range_search(codepoint, cache) | ||
if range then | |||
return match_func(codepoint, manual_unpack(range, 3)) | |||
end | end | ||
local | local range, index = binary_range_search(codepoint, ranges) | ||
if range then | |||
table.insert(cache, range) | |||
table.sort(cache, compare_ranges) | |||
return match_func(codepoint, manual_unpack(range, 3)) | |||
end | end | ||
if ranges[ | if ranges[index] then | ||
local dots_range | local dots_range | ||
if codepoint > ranges[ | if codepoint > ranges[index][2] then | ||
dots_range = { | dots_range = { | ||
ranges[ | ranges[index][2] + 1, | ||
ranges[ | ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF, | ||
unpack(dots) | unpack(dots) | ||
} | } | ||
else -- codepoint < range[ | else -- codepoint < range[index][1] | ||
dots_range = { | dots_range = { | ||
ranges[ | ranges[index - 1] and ranges[index - 1][2] + 1 or 0, | ||
ranges[ | ranges[index][1] - 1, | ||
unpack(dots) | unpack(dots) | ||
} | } | ||
end | end | ||
table. | table.sort(cache, compare_ranges) | ||
end | end | ||
return match_func(codepoint) | return match_func(codepoint) | ||
Line 318: | Line 333: | ||
end | end | ||
-- Get a | -- Get a code point's combining class value in [[Module:Unicode data/combining]], | ||
-- and return whether this value is not zero. Zero is assigned as the default | -- and return whether this value is not zero. Zero is assigned as the default | ||
-- if the combining class value is not found in this data module. | -- if the combining class value is not found in this data module. | ||
-- That is, return true if character is combining, or false if it is not. | -- That is, return true if character is combining, or false if it is not. | ||
-- See | -- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for | ||
-- more information. | -- more information. | ||
p.is_combining = memo_lookup( | p.is_combining = memo_lookup( | ||
"combining", | |||
function (codepoint, combining_class) | |||
return combining_class and combining_class ~= 0 or false | |||
end, | |||
0) | |||
end, 0) | |||
function p.add_dotted_circle(str) | function p.add_dotted_circle(str) | ||
Line 341: | Line 355: | ||
end | end | ||
local lookup_control = memo_lookup( | local lookup_control = memo_lookup( | ||
"control", | |||
function (codepoint, ccc) | |||
return ccc or "assigned" | |||
end, | |||
end, "assigned") | "assigned") | ||
p.lookup_control = lookup_control | |||
function p.is_assigned(codepoint) | function p.is_assigned(codepoint) | ||
Line 362: | Line 377: | ||
end | end | ||
local | p.lookup_category = memo_lookup( | ||
[ | "category", | ||
[ | function (codepoint, category) | ||
return category | |||
end, | |||
"Cn") | |||
[ | local lookup_script = memo_lookup( | ||
[ | "scripts", | ||
function (codepoint, script_code) | |||
[ | return script_code or 'Zzzz' | ||
end, | |||
"Zzzz") | |||
[ | p.lookup_script = lookup_script | ||
[ | |||
function p.get_best_script(str) | |||
-- Check type of argument, because mw.text.decode coerces numbers to strings! | |||
require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
-- Convert HTML character references (including named character references, | |||
-- or character entities) to characters. | |||
str = mw.text.decode(str, true) | |||
local scripts = {} | |||
for codepoint in mw.ustring.gcodepoint(str) do | |||
local script = lookup_script(codepoint) | |||
-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts. | |||
if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then | |||
scripts[script] = true | |||
end | |||
end | |||
-- If scripts does not contain two or more keys, | |||
-- return first and only key (script code) in table. | |||
if not next(scripts, next(scripts)) then | |||
return next(scripts) | |||
end -- else return majority script, or else "Zzzz"? | |||
end | |||
function p.is_Latin(str) | |||
require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
str = mw.text.decode(str, true) | |||
-- Search for the leading bytes that introduce the UTF-8 encoding of the | |||
-- code points U+0340-U+10FFFF. If they are not found and there is at least | |||
-- one Latin-script character, the string counts as Latin, because the rest | |||
-- of the characters can only be Zyyy, Zinh, and Zzzz. | |||
-- The only scripts found below U+0370 (the first code point of the Greek | |||
-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz. | |||
-- See the codepage in the [[UTF-8]] article. | |||
if not str:find "[\205-\244]" then | |||
for codepoint in mw.ustring.gcodepoint(str) do | |||
if lookup_script(codepoint) == "Latn" then | |||
return true | |||
end | |||
end | |||
end | |||
local Latn = false | |||
for codepoint in mw.ustring.gcodepoint(str) do | |||
local script = lookup_script(codepoint) | |||
if script == "Latn" then | |||
Latn = true | |||
elseif not (script == "Zyyy" or script == "Zinh" | |||
or script == "Zzzz") then | |||
return false | |||
end | |||
end | |||
return Latn | |||
end | |||
-- Checks that a string contains only characters belonging to right-to-left | |||
-- scripts, or characters of ignorable scripts. | |||
function p.is_rtl(str) | |||
require "libraryUtil".checkType("get_best_script", 1, str, "string") | |||
str = mw.text.decode(str, true) | |||
-- Search for the leading bytes that introduce the UTF-8 encoding of the | |||
-- code points U+0580-U+10FFFF. If they are not found, the string can only | |||
-- have characters from a left-to-right script, because the first code point | |||
-- in a right-to-left script is U+0591, in the Hebrew block. | |||
if not str:find "[\214-\244]" then | |||
return false | |||
end | |||
local result = false | |||
local rtl = loader.scripts.rtl | |||
for codepoint in mw.ustring.gcodepoint(str) do | |||
local script = lookup_script(codepoint) | |||
if rtl[script] then | |||
result = true | |||
elseif not (script == "Zyyy" or script == "Zinh" | |||
or script == "Zzzz") then | |||
return false | |||
end | |||
end | |||
return result | |||
end | |||
local function get_codepoint(args, arg) | |||
local codepoint_string = args[arg] | |||
or errorf(2, "Parameter %s is required", tostring(arg)) | |||
local codepoint = tonumber(codepoint_string, 16) | |||
or errorf(2, "Parameter %s is not a code point in hexadecimal base", | |||
tostring(arg)) | |||
if not (0 <= codepoint and codepoint <= 0x10FFFF) then | |||
errorf(2, "code point in parameter %s out of range", tostring(arg)) | |||
end | |||
return codepoint | |||
end | |||
local function get_func(args, arg, prefix) | |||
local suffix = args[arg] | |||
or errorf(2, "Parameter %s is required", tostring(arg)) | |||
suffix = mw.text.trim(suffix) | |||
local func_name = prefix .. suffix | |||
local func = p[func_name] | |||
or errorf(2, "There is no function '%s'", func_name) | |||
return func | |||
end | |||
function p. | -- This function allows any of the "lookup" functions to be invoked. The first | ||
if | -- parameter is the word after "lookup_"; the second parameter is the code point | ||
-- in hexadecimal base. | |||
function p.lookup(frame) | |||
local func = get_func(frame.args, 1, "lookup_") | |||
local codepoint = get_codepoint(frame.args, 2) | |||
local result = func(codepoint) | |||
if func == p.lookup_name then | |||
-- Prevent code point labels such as <control-0000> from being | |||
-- interpreted as HTML tags. | |||
result = result:gsub("<", "<") | |||
end | end | ||
return result | |||
return | end | ||
function p.is(frame) | |||
local func = get_func(frame.args, 1, "is_") | |||
-- is_Latin and is_valid_pagename take strings. | |||
if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then | |||
return (func(frame.args[2])) | |||
else -- The rest take code points. | |||
local codepoint = get_codepoint(frame.args, 2) | |||
return (func(codepoint)) -- Adjust to one result. | |||
end | end | ||
end | end | ||
return p | return p |
Latest revision as of 09:18, 13 December 2022
Documentation for this module may be created at Module:Unicode data/doc
local p = {}
local floor = math.floor
-- Raise an error whose message is built with string.format.
-- Called as errorf(level, fmt, ...): "level" is passed to error() increased by
-- one, so that level 1 refers to the function that called errorf.
-- Called as errorf(fmt, ...): the error is attributed to errorf's caller.
local function errorf(level, ...)
    if type(level) ~= "number" then
        -- No level was supplied; "level" is really the format string.
        return error(string.format(level, ...), 2)
    end
    return error(string.format(...), level + 1)
end
-- Binary search through "ranges", an array of tables whose first two items are
-- the lowest and highest code point of a range. The array has to be ordered by
-- code point for the search to work.
-- Returns the range that contains "codepoint" and its index. If no range
-- contains it, returns nil and the last index that was examined (nil when the
-- array is empty).
-- The number of ranges is read from ranges.length when that field is set;
-- otherwise it is computed with [[Module:TableTools]].
local function binary_range_search(codepoint, ranges)
    local mid
    local lo = 1
    local hi = ranges.length or require("Module:TableTools").length(ranges)
    while lo <= hi do
        mid = floor((lo + hi) / 2)
        local candidate = ranges[mid]
        if codepoint < candidate[1] then
            -- Target lies before this range: continue in the lower half.
            hi = mid - 1
        else
            if codepoint <= candidate[2] then
                return candidate, mid
            end
            -- Target lies after this range: continue in the upper half.
            lo = mid + 1
        end
    end
    return nil, mid
end
p.binary_range_search = binary_range_search
--[[
local function linear_range_search(codepoint, ranges)
for i, range in ipairs(ranges) do
if range[1] <= codepoint and codepoint <= range[2] then
return range
end
end
end
--]]
-- Lazily loaded data submodules. Index "loader" with the name of a module
-- minus the "Module:Unicode data/" part: loader.blocks, for instance, gives
-- [[Module:Unicode data/blocks]], loaded through mw.loadData.
-- The outcome is stored in "loader" under the same key, so the __index
-- function runs only once per name. A module that cannot be loaded is stored
-- and returned as false.
local loader = setmetatable({}, {
    __index = function (self, key)
        local ok, module_data = pcall(mw.loadData, "Module:Unicode data/" .. key)
        local value = false
        if ok then
            value = module_data
        end
        self[key] = value
        return value
    end
})
-- Ranges of code points whose names (or code point labels) are generated
-- rather than stored in the names data modules. Each entry is
-- { first code point, last code point, hook }, where the hook is either a
-- string.format pattern applied to the code point or a function that receives
-- the code point and returns the name.
--
-- IMPORTANT: the entries must be in ascending order of code point and must not
-- overlap, because p.lookup_name searches this table with
-- binary_range_search. An out-of-order entry is silently never found.
--
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
local name_hooks = {
    { 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters
    { 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters
    { 0x3400, 0x4DBF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
    { 0x4E00, 0x9FFF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
    { 0xAC00, 0xD7A3, function (codepoint) -- Hangul Syllables
        local Hangul_data = loader.Hangul
        local syllable_index = codepoint - 0xAC00

        return ("HANGUL SYLLABLE %s%s%s"):format(
            Hangul_data.leads[floor(syllable_index / Hangul_data.final_count)],
            Hangul_data.vowels[floor((syllable_index % Hangul_data.final_count)
                / Hangul_data.trail_count)],
            Hangul_data.trails[syllable_index % Hangul_data.trail_count]
        )
    end },
    -- High Surrogates, High Private Use Surrogates, Low Surrogates
    { 0xD800, 0xDFFF, "<surrogate-%04X>" },
    { 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use
    -- CJK Compatibility Ideographs
    { 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
    { 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
    { 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph
    { 0x18800, 0x18AFF, function (codepoint)
        return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF)
    end },
    { 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement
    { 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu
    { 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
    { 0x2A700, 0x2B739, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
    { 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
    { 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E
    { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F
    -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
    { 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" },
    { 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
    { 0x31350, 0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
    -- Variation Selectors Supplement. This entry has to come after the
    -- plane 3 ideograph ranges above to keep the table sorted.
    { 0xE0100, 0xE01EF, function (codepoint)
        return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17)
    end },
    { 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
    { 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use
}
-- Stored so that binary_range_search does not have to count the entries.
name_hooks.length = #name_hooks
local name_range_cache
-- Build a name from the third item of a name hook: a string is used as a
-- string.format pattern for the code point; any other value is called with
-- the code point and its result is returned.
local function generate_name(data, codepoint)
    if type(data) ~= "string" then
        return data(codepoint)
    end
    return data:format(codepoint)
end
--[[
-- Checks that the code point is a number and in range.
-- Does not check whether code point is an integer.
-- Not used
local function check_codepoint(funcName, argIdx, val)
require 'libraryUtil'.checkType(funcName, argIdx, val, 'number')
if codepoint < 0 or 0x10FFFF < codepoint then
errorf("Codepoint %04X out of range", codepoint)
end
end
--]]
-- Returns the name of a code point, or a code point label in angle brackets
-- (<noncharacter-XXXX>, <control-XXXX>, <reserved-XXXX>, ...) when it has no
-- name. Lookup order: noncharacters, the most recently used name hook, the
-- name_hooks table, then the "names/XXX" data module for the code point.
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
    -- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
    -- (Cn) and specifically noncharacters:
    -- https://www.unicode.org/faq/private_use.html#nonchar4
    if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
            or floor(codepoint % 0x10000) >= 0xFFFE) then
        return ("<noncharacter-%04X>"):format(codepoint)
    end

    if name_range_cache -- Check if previously used "name hook" applies to this code point.
            and codepoint >= name_range_cache[1]
            and codepoint <= name_range_cache[2] then
        return generate_name(name_range_cache[3], codepoint)
    end

    local range = binary_range_search(codepoint, name_hooks)
    if range then
        -- Remember the hook: consecutive lookups tend to hit the same range.
        name_range_cache = range
        return generate_name(range[3], codepoint)
    end

    -- The data module is selected by the code point divided by 0x1000. The
    -- quotient is floored explicitly because "%X" expects an integer; on
    -- Lua 5.3 and later a fractional argument raises an error.
    local data = loader[('names/%03X'):format(floor(codepoint / 0x1000))]

    if data and data[codepoint] then
        return data[codepoint]
    end

    -- Unassigned (Cn) consists of noncharacters and reserved characters.
    -- The character has been established not to be a noncharacter,
    -- and if it were assigned, its name would already have been retrieved,
    -- so it must be reserved.
    return ("<reserved-%04X>"):format(codepoint)
end
--[[
-- No image data modules on Wikipedia yet.
function p.lookup_image(codepoint)
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
if data then
return data[codepoint]
end
end
--]]
-- Names of the Unicode planes that have one, keyed by plane number.
local planes = {
    [0] = "Basic Multilingual Plane",
    [1] = "Supplementary Multilingual Plane",
    [2] = "Supplementary Ideographic Plane",
    [3] = "Tertiary Ideographic Plane",
    [14] = "Supplementary Special-purpose Plane",
    [15] = "Supplementary Private Use Area-A",
    [16] = "Supplementary Private Use Area-B",
}
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable.
local blocks
-- Iterator step used by p.enum_blocks. Given the block list and the previous
-- index, returns the next index followed by the first three items of that
-- block entry, or nothing once the list is exhausted.
local function block_iter(blocks, i)
    local next_index = i + 1
    local block = blocks[next_index]
    if not block then
        return
    end
    -- Unpack doesn't work on tables loaded with mw.loadData, so the items
    -- are listed one by one.
    return next_index, block[1], block[2], block[3]
end
-- An ipairs-type iterator generator for the list of blocks in
-- [[Module:Unicode data/blocks]]. Each step yields the index and the first
-- three items of a block entry (see block_iter).
function p.enum_blocks()
    return block_iter, loader.blocks, 0
end
-- Returns the name of the plane that contains the code point. Planes without
-- an entry in "planes" are reported as "Plane N".
function p.lookup_plane(codepoint)
    local plane = floor(codepoint / 0x10000)
    local name = planes[plane]
    if name then
        return name
    end
    return ("Plane %u"):format(plane)
end
-- Returns the third item (the block name) of the entry in
-- [[Module:Unicode data/blocks]] whose range contains the code point, or
-- "No Block" if no entry contains it.
function p.lookup_block(codepoint)
    local found = binary_range_search(codepoint, loader.blocks)
    if not found then
        return "No Block"
    end
    return found[3]
end
-- Linear search of [[Module:Unicode data/blocks]] for the entry whose third
-- item equals "name". Returns that entry, or nothing if there is none.
function p.get_block_info(name)
    local block_list = loader.blocks
    for _, info in ipairs(block_list) do
        if info[3] == name then
            return info
        end
    end
end
-- Individual code points that make a page name invalid.
local forbidden_in_pagename = {
    [0x0023] = true, -- #
    [0x005B] = true, -- [
    [0x005D] = true, -- ]
    [0x007B] = true, -- {
    [0x007C] = true, -- |
    [0x007D] = true, -- }
    [0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
    [0xFFFD] = true, -- REPLACEMENT CHARACTER
}

-- Returns true if every code point of "pagename" is allowed and printable
-- (according to p.is_printable) and at least one of them is not classified as
-- "space-separator". Returns false otherwise, including for an empty string.
function p.is_valid_pagename(pagename)
    local seen_non_space = false

    for cp in mw.ustring.gcodepoint(pagename) do
        if forbidden_in_pagename[cp]
                -- spaces in General Punctuation block
                or (0x2000 <= cp and cp <= 0x200A) then
            return false
        end

        local printable, kind = p.is_printable(cp)
        if not printable then
            return false
        end

        seen_non_space = seen_non_space or kind ~= "space-separator"
    end

    return seen_non_space
end
-- Returns the items of the array "what" from index "from" (default 1) onward
-- as multiple values. Written by hand because, as noted elsewhere in this
-- module, unpack doesn't work on tables loaded with mw.loadData.
-- When there is no item after "from", what[from] is returned directly
-- without building a temporary table.
local function manual_unpack(what, from)
    -- The default has to be applied before "from" is used in arithmetic;
    -- otherwise omitting the argument raises an error.
    from = from or 1

    if what[from + 1] == nil then
        return what[from]
    end

    local result = {}
    for i, item in ipairs(what) do
        if i >= from then
            table.insert(result, item)
        end
    end
    return unpack(result)
end
-- Comparison function for table.sort: orders ranges by their first item
-- (the lowest code point of the range).
local function compare_ranges(range1, range2)
    local start1, start2 = range1[1], range2[1]
    return start1 < start2
end
-- Creates a function to look up data in a module that contains "singles" (a
-- code point-to-data map) and "ranges" (an array containing arrays that contain
-- the low and high code points of a range and the data associated with that
-- range).
-- "data_module_subpage" is the name of the data module minus the
-- "Module:Unicode data/" part; the module is loaded through "loader" the first
-- time the returned function is called.
-- "match_func" is passed the code point and either the data or the "dots", and
-- generates the final result of the function.
-- The varargs ("dots") describes the default data to be returned if there wasn't
-- a match.
-- In case the function is used more than once, "cache" saves ranges that have
-- already been found to match, or a range whose data is the default if there
-- was no match.
local function memo_lookup(data_module_subpage, match_func, ...)
    local dots = { ... }
    local cache = {}
    local singles, ranges

    return function (codepoint)
        if not singles then
            local data_module = loader[data_module_subpage]
            if not data_module then
                -- "loader" yields false for a module that cannot be loaded.
                error(("Module:Unicode data/%s could not be loaded")
                    :format(tostring(data_module_subpage)))
            end
            singles, ranges = data_module.singles, data_module.ranges
        end

        if singles[codepoint] then
            return match_func(codepoint, singles[codepoint])
        end

        -- Ranges (including default-valued gaps) seen in earlier calls.
        local cached = binary_range_search(codepoint, cache)
        if cached then
            return match_func(codepoint, manual_unpack(cached, 3))
        end

        local range, index = binary_range_search(codepoint, ranges)
        if range then
            table.insert(cache, range)
            table.sort(cache, compare_ranges)
            return match_func(codepoint, manual_unpack(range, 3))
        end

        -- No match. "index" is the last range examined by the search, so the
        -- code point lies in the gap directly after or directly before it.
        -- Cache that whole gap with the default data.
        if index and ranges[index] then
            local dots_range
            if codepoint > ranges[index][2] then
                dots_range = {
                    ranges[index][2] + 1,
                    ranges[index + 1] and ranges[index + 1][1] - 1 or 0x10FFFF,
                    unpack(dots)
                }
            else -- codepoint < ranges[index][1]
                dots_range = {
                    ranges[index - 1] and ranges[index - 1][2] + 1 or 0,
                    ranges[index][1] - 1,
                    unpack(dots)
                }
            end
            -- The gap has to be stored, or the cache never serves defaults.
            table.insert(cache, dots_range)
            table.sort(cache, compare_ranges)
        end

        -- Pass the default data, as documented above, so that the result is
        -- the same whether or not the gap was served from the cache.
        return match_func(codepoint, unpack(dots))
    end
end
-- Look up a code point's combining class value in
-- [[Module:Unicode data/combining]] and report whether it is nonzero, i.e.
-- return true if the character is combining and false if it is not.
-- Zero is given as the default for code points not found in the data module.
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for
-- more information.
p.is_combining = memo_lookup(
    "combining",
    function (codepoint, combining_class)
        if combining_class and combining_class ~= 0 then
            return true
        end
        return false
    end,
    0)
-- Returns "str" with a dotted circle (◌) inserted before every character for
-- which p.is_combining is true. Other characters are left as they are.
function p.add_dotted_circle(str)
    local function prefix_if_combining(char)
        if p.is_combining(mw.ustring.codepoint(char)) then
            return '◌' .. char
        end
    end

    -- gsub also returns the number of matches; only the string is wanted.
    local replaced = mw.ustring.gsub(str, ".", prefix_if_combining)
    return replaced
end
-- Looks up a code point in [[Module:Unicode data/control]] and returns the
-- value stored for it, or "assigned" if nothing is stored. The other values
-- this module checks for are "unassigned" and "space-separator".
local lookup_control = memo_lookup(
    "control",
    function (codepoint, status)
        if status then
            return status
        end
        return "assigned"
    end,
    "assigned")
p.lookup_control = lookup_control
-- Returns false if lookup_control reports "unassigned" for the code point,
-- and true otherwise.
function p.is_assigned(codepoint)
    local status = lookup_control(codepoint)
    return status ~= "unassigned"
end
-- Returns two values: whether lookup_control reports "assigned" or
-- "space-separator" for the code point, and the reported value itself.
function p.is_printable(codepoint)
    local status = lookup_control(codepoint)
    local printable = status == "assigned" or status == "space-separator"
    return printable, status
end
-- Returns two values: whether lookup_control reports "space-separator" for
-- the code point, and the reported value itself.
function p.is_whitespace(codepoint)
    local status = lookup_control(codepoint)
    local is_space = status == "space-separator"
    return is_space, status
end
-- Looks up a code point in [[Module:Unicode data/category]] and returns the
-- value stored for it. Code points with no entry get the default "Cn"
-- (Unassigned). The fallback is applied here as well as being passed to
-- memo_lookup, in the same way lookup_script handles its default, so the
-- result is never nil.
p.lookup_category = memo_lookup(
    "category",
    function (codepoint, category)
        return category or "Cn"
    end,
    "Cn")
-- Looks up a code point in [[Module:Unicode data/scripts]] and returns the
-- script code stored for it, or "Zzzz" if nothing is stored.
local lookup_script = memo_lookup(
    "scripts",
    function (codepoint, script_code)
        if script_code then
            return script_code
        end
        return 'Zzzz'
    end,
    "Zzzz")
p.lookup_script = lookup_script
-- Collects the script codes of the characters in "str", ignoring "Zyyy",
-- "Zinh" and "Zzzz". If at most one script code was collected, returns the
-- result of next() on the collection (nothing useful when it is empty).
-- Returns nothing when two or more script codes were found.
function p.get_best_script(str)
    -- Check type of argument, because mw.text.decode coerces numbers to strings!
    require("libraryUtil").checkType("get_best_script", 1, str, "string")

    -- Convert HTML character references (including named character references,
    -- or character entities) to characters.
    str = mw.text.decode(str, true)

    local found = {}
    for codepoint in mw.ustring.gcodepoint(str) do
        local script = lookup_script(codepoint)

        -- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
        if script ~= "Zyyy" and script ~= "Zinh" and script ~= "Zzzz" then
            found[script] = true
        end
    end

    -- If there is no key after the first one, the table holds fewer than two
    -- keys: return the first and only key (script code) in the table.
    local first = next(found)
    if not next(found, first) then
        return next(found)
    end -- else return majority script, or else "Zzzz"?
end
-- Returns true if "str" contains at least one Latin-script character and no
-- character of any other script, ignoring the script codes "Zyyy", "Zinh"
-- and "Zzzz". Returns false otherwise.
function p.is_Latin(str)
    -- Report this function's own name in the type error.
    require "libraryUtil".checkType("is_Latin", 1, str, "string")
    str = mw.text.decode(str, true)

    -- Search for the leading bytes that introduce the UTF-8 encoding of the
    -- code points U+0340-U+10FFFF. If they are not found and there is at least
    -- one Latin-script character, the string counts as Latin, because the rest
    -- of the characters can only be Zyyy, Zinh, and Zzzz.
    -- The only scripts found below U+0370 (the first code point of the Greek
    -- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
    -- See the codepage in the [[UTF-8]] article.
    if not str:find "[\205-\244]" then
        for codepoint in mw.ustring.gcodepoint(str) do
            if lookup_script(codepoint) == "Latn" then
                return true
            end
        end
        -- No Latin character was seen, so the general scan below could only
        -- return false as well; skip the second pass over the string.
        return false
    end

    local Latn = false

    for codepoint in mw.ustring.gcodepoint(str) do
        local script = lookup_script(codepoint)

        if script == "Latn" then
            Latn = true
        elseif not (script == "Zyyy" or script == "Zinh"
                or script == "Zzzz") then
            return false
        end
    end

    return Latn
end
-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts ("Zyyy", "Zinh", "Zzzz").
-- Returns true only if at least one right-to-left character is present.
-- The set of right-to-left script codes is the "rtl" table of
-- [[Module:Unicode data/scripts]].
function p.is_rtl(str)
    -- Report this function's own name in the type error.
    require "libraryUtil".checkType("is_rtl", 1, str, "string")
    str = mw.text.decode(str, true)

    -- Search for the leading bytes that introduce the UTF-8 encoding of the
    -- code points U+0580-U+10FFFF. If they are not found, the string can only
    -- have characters from a left-to-right script, because the first code point
    -- in a right-to-left script is U+0591, in the Hebrew block.
    if not str:find "[\214-\244]" then
        return false
    end

    local result = false
    local rtl = loader.scripts.rtl
    for codepoint in mw.ustring.gcodepoint(str) do
        local script = lookup_script(codepoint)

        if rtl[script] then
            result = true
        elseif not (script == "Zyyy" or script == "Zinh"
                or script == "Zzzz") then
            return false
        end
    end

    return result
end
-- Reads parameter "arg" from "args", parses it as a hexadecimal number and
-- returns it. Raises an error (through errorf, at level 2) if the parameter
-- is missing, is not hexadecimal, or lies outside 0..0x10FFFF.
local function get_codepoint(args, arg)
    local raw = args[arg]
    if not raw then
        errorf(2, "Parameter %s is required", tostring(arg))
    end

    local codepoint = tonumber(raw, 16)
    if not codepoint then
        errorf(2, "Parameter %s is not a code point in hexadecimal base",
            tostring(arg))
    end

    local in_range = 0 <= codepoint and codepoint <= 0x10FFFF
    if not in_range then
        errorf(2, "code point in parameter %s out of range", tostring(arg))
    end

    return codepoint
end
-- Reads parameter "arg" from "args", trims it, appends it to "prefix" and
-- returns the function of that name from the export table "p". Raises an
-- error (through errorf, at level 2) if the parameter is missing or "p" has
-- no such entry.
local function get_func(args, arg, prefix)
    local suffix = args[arg]
    if not suffix then
        errorf(2, "Parameter %s is required", tostring(arg))
    end

    local func_name = prefix .. mw.text.trim(suffix)
    local func = p[func_name]
    if not func then
        errorf(2, "There is no function '%s'", func_name)
    end

    return func
end
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
    local func = get_func(frame.args, 1, "lookup_")
    local codepoint = get_codepoint(frame.args, 2)
    local result = func(codepoint)
    if func == p.lookup_name then
        -- Prevent code point labels such as <control-0000> from being
        -- interpreted as HTML tags: escape "<" as a character entity.
        -- (Replacing "<" with itself would leave the label unescaped.)
        -- The assignment keeps only the string and drops gsub's match count.
        result = result:gsub("<", "&lt;")
    end
    return result
end
-- Allows any of the "is_" functions to be invoked. The first parameter is the
-- word after "is_"; the second parameter is a string for is_Latin,
-- is_valid_pagename and is_rtl, and a code point in hexadecimal base for the
-- rest. Only the first result of the function is returned.
function p.is(frame)
    local args = frame.args
    local func = get_func(args, 1, "is_")

    local takes_string = func == p.is_Latin
        or func == p.is_valid_pagename
        or func == p.is_rtl

    local argument
    if takes_string then
        argument = args[2]
    else -- The rest take code points.
        argument = get_codepoint(args, 2)
    end

    return (func(argument)) -- Adjust to one result.
end
return p