Module:Unicode data
From Vigyanwiki
Documentation for this module may be created at Module:Unicode data/doc
local export = {}
local floor = math.floor
-- The following leads, vowels, and trails come from here:
-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- For the algorithm used to generate Hangul Syllable names,
-- see "Hangul Syllable Name Generation" in section 3.12 of the
-- Unicode Specification:
-- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
--
-- Each table is zero-based so that it can be indexed directly with the
-- lead/vowel/trail index computed from a syllable's offset. Because `#` does
-- not count the [0] entry, every element count below is `#t + 1`.
-- All counts are declared `local` so that loading this module does not
-- create global variables.
local hangul_leads = {
	[0] = "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS",
	"", "J", "JJ", "C", "K", "T", "P", "H"
}
hangul_leads.length = #hangul_leads + 1
local hangul_vowels = {
	[0] = "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
	"WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
	"I"
}
local hangul_vowel_count = #hangul_vowels + 1
local hangul_trails = {
	[0] = "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
	"LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K",
	"T", "P", "H"
}
local hangul_trail_count = #hangul_trails + 1
-- Number of syllables that share one lead: vowels * trails.
local hangul_codas = hangul_vowel_count * hangul_trail_count
-- Codepoint ranges whose names are generated rather than looked up.
-- Each entry is { first, last, generator }, where the generator is either a
-- format string that receives the codepoint or a function taking the
-- codepoint. Entries must stay sorted by `first` and must not overlap,
-- because lookup_name stops scanning at the first entry that starts above
-- the codepoint.
local name_hooks = {
	{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters
	{ 0x7F, 0x9F, "<control-%04X>" }, -- DEL and C1 control characters
	{ 0x3400, 0x4DB5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{ 0x4E00, 0x9FEF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph --change v10
	{ 0xAC00, 0xD7A3, function (codepoint)
		-- Decompose the syllable offset into lead, vowel and trail indices.
		local syllable_index = codepoint - 0xAC00
		return ("HANGUL SYLLABLE %s%s%s"):format(
			hangul_leads[floor(syllable_index / hangul_codas)],
			hangul_vowels[floor((syllable_index % hangul_codas) / hangul_trail_count)],
			hangul_trails[syllable_index % hangul_trail_count]
		)
	end },
	{ 0xD800, 0xDB7F, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
	{ 0xDB80, 0xDBFF, "<surrogate-%04X>" }, -- Private Use High Surrogate
	{ 0xDC00, 0xDFFF, "<surrogate-%04X>" }, -- Low Surrogate
	{ 0xE000, 0xF8FF, "<private-use-%04X>" }, -- Private Use
	{ 0x17000, 0x187F1, "TANGUT IDEOGRAPH-%05X" }, -- Tangut
	{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%05X" }, -- Nushu --add v10
	{ 0x20000, 0x2A6D6, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension B
	{ 0x2A700, 0x2B734, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740; a start of 0x2A740 would overlap
	-- Extension C and name the unassigned U+2B735..U+2B73F.
	{ 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension D
	{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension E
	{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%05X" }, -- CJK Ideograph Extension F --add v10
	{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{ 0xF0000, 0xFFFFD, "<private-use-%05X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10FFFD, "<private-use-%06X>" } -- Plane 16 Private Use
}
-- Entry of name_hooks that matched most recently (set by lookup_name).
local name_range_cache
-- Produces a name from a hook's generator: a function generator is called
-- with the codepoint, a string generator is used as a format string.
local function generate_name(data, codepoint)
	if type(data) ~= "string" then
		return data(codepoint)
	end
	return data:format(codepoint)
end
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
--
-- Returns the name or label for a codepoint (a number).
-- Resolution order:
--   1. noncharacters get "<noncharacter-XXXX>";
--   2. codepoints inside a name_hooks range get a generated name;
--   3. otherwise the name is read from the data page
--      "Module:Unicode data/names/XXX", where XXX is the codepoint divided
--      by 0x1000 (one page per 4096 codepoints);
--   4. anything still unnamed is reported as "<reserved-XXXX>".
function export.lookup_name(codepoint)
	-- U+FDD0-U+FDEF and all codepoints ending in FFFE or FFFF are noncharacters:
	-- https://www.unicode.org/faq/private_use.html#nonchar4
	if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
			or floor(codepoint % 0x10000) >= 0xFFFE) then
		return ("<noncharacter-%04X>"):format(codepoint)
	end

	-- Fast path: try the hook range that matched on the previous call.
	if name_range_cache then
		if (codepoint >= name_range_cache[1]) and (codepoint <= name_range_cache[2]) then
			return generate_name(name_range_cache[3], codepoint)
		end
	end

	-- name_hooks is scanned in order; once an entry starts above the
	-- codepoint no later entry is examined.
	for _, item in ipairs(name_hooks) do
		if codepoint < item[1] then
			break
		elseif codepoint <= item[2] then
			name_range_cache = item
			return generate_name(item[3], codepoint)
		end
	end

	-- Floor the page index explicitly instead of handing a fractional
	-- number to %X and relying on implicit truncation.
	-- pcall guards against the data page not existing.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/names/%03X'):format(floor(codepoint / 0x1000)))

	if success and data[codepoint] then
		return data[codepoint]
	-- Unassigned (Cn) includes noncharacters and reserved characters.
	-- The character is not a noncharacter, and if it were assigned its name
	-- would already have been retrieved, so it must be reserved.
	else
		return ("<reserved-%04X>"):format(codepoint)
	end
end
-- Returns the entry for `codepoint` from the data page
-- "Module:Unicode data/images/XXX" (XXX = codepoint divided by 0x1000, in
-- hex), or nothing when that page cannot be loaded.
function export.lookup_image(codepoint)
	-- Floor the page index explicitly instead of handing a fractional
	-- number to %X and relying on implicit truncation.
	local success, data = pcall(mw.loadData,
		('Module:Unicode data/images/%03X'):format(floor(codepoint / 0x1000))
	)

	if success then
		return data[codepoint]
	end
end
-- Template entry point: reads the codepoint from the first argument of the
-- invocation (or of the parent frame) and returns its name with every "<"
-- replaced by "&lt;", so that labels such as "<control-0000>" are shown
-- literally rather than being parsed as markup.
-- Raises an error when the argument is missing or not a number.
function export.template_lookup_name(frame)
	local codepoint = tonumber(frame.args[1] or frame:getParent().args[1])
	if not codepoint then
		error("template_lookup_name requires a numeric codepoint as its first parameter.")
	end
	local name = export.lookup_name(codepoint)
	-- Parentheses drop gsub's second result (the substitution count) so
	-- that only the escaped name is returned.
	return (name:gsub("<", "&lt;"))
end
-- Names of the Unicode planes that have one, keyed by plane number
-- (codepoint divided by 0x10000). Plane 14 (U+E0000..U+EFFFF) holds the
-- Tags block; planes 15 and 16 are the two supplementary private use areas,
-- matching the last two entries of the `blocks` table.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}
-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
-- This should be kept synchronized with [[Module:category tree/scriptcatboiler/blocks]].
-- Each entry is { block name, first codepoint, last codepoint }.
-- Entries are listed in ascending codepoint order without overlaps;
-- lookup_block binary-searches this table and depends on that ordering.
-- Codepoints that fall between two entries belong to no block.
local blocks = {
{ "Basic Latin", 0x000000, 0x00007F },
{ "Latin-1 Supplement", 0x000080, 0x0000FF },
{ "Latin Extended-A", 0x000100, 0x00017F },
{ "Latin Extended-B", 0x000180, 0x00024F },
{ "IPA Extensions", 0x000250, 0x0002AF },
{ "Spacing Modifier Letters", 0x0002B0, 0x0002FF },
{ "Combining Diacritical Marks", 0x000300, 0x00036F },
{ "Greek and Coptic", 0x000370, 0x0003FF },
{ "Cyrillic", 0x000400, 0x0004FF },
{ "Cyrillic Supplement", 0x000500, 0x00052F },
{ "Armenian", 0x000530, 0x00058F },
{ "Hebrew", 0x000590, 0x0005FF },
{ "Arabic", 0x000600, 0x0006FF },
{ "Syriac", 0x000700, 0x00074F },
{ "Arabic Supplement", 0x000750, 0x00077F },
{ "Thaana", 0x000780, 0x0007BF },
{ "NKo", 0x0007C0, 0x0007FF },
{ "Samaritan", 0x000800, 0x00083F },
{ "Mandaic", 0x000840, 0x00085F },
{ "Syriac Supplement", 0x000860, 0x00086F },
{ "Arabic Extended-A", 0x0008A0, 0x0008FF },
{ "Devanagari", 0x000900, 0x00097F },
{ "Bengali", 0x000980, 0x0009FF },
{ "Gurmukhi", 0x000A00, 0x000A7F },
{ "Gujarati", 0x000A80, 0x000AFF },
{ "Oriya", 0x000B00, 0x000B7F },
{ "Tamil", 0x000B80, 0x000BFF },
{ "Telugu", 0x000C00, 0x000C7F },
{ "Kannada", 0x000C80, 0x000CFF },
{ "Malayalam", 0x000D00, 0x000D7F },
{ "Sinhala", 0x000D80, 0x000DFF },
{ "Thai", 0x000E00, 0x000E7F },
{ "Lao", 0x000E80, 0x000EFF },
{ "Tibetan", 0x000F00, 0x000FFF },
{ "Myanmar", 0x001000, 0x00109F },
{ "Georgian", 0x0010A0, 0x0010FF },
{ "Hangul Jamo", 0x001100, 0x0011FF },
{ "Ethiopic", 0x001200, 0x00137F },
{ "Ethiopic Supplement", 0x001380, 0x00139F },
{ "Cherokee", 0x0013A0, 0x0013FF },
{ "Unified Canadian Aboriginal Syllabics", 0x001400, 0x00167F },
{ "Ogham", 0x001680, 0x00169F },
{ "Runic", 0x0016A0, 0x0016FF },
{ "Tagalog", 0x001700, 0x00171F },
{ "Hanunoo", 0x001720, 0x00173F },
{ "Buhid", 0x001740, 0x00175F },
{ "Tagbanwa", 0x001760, 0x00177F },
{ "Khmer", 0x001780, 0x0017FF },
{ "Mongolian", 0x001800, 0x0018AF },
{ "Unified Canadian Aboriginal Syllabics Extended", 0x0018B0, 0x0018FF },
{ "Limbu", 0x001900, 0x00194F },
{ "Tai Le", 0x001950, 0x00197F },
{ "New Tai Lue", 0x001980, 0x0019DF },
{ "Khmer Symbols", 0x0019E0, 0x0019FF },
{ "Buginese", 0x001A00, 0x001A1F },
{ "Tai Tham", 0x001A20, 0x001AAF },
{ "Combining Diacritical Marks Extended", 0x001AB0, 0x001AFF },
{ "Balinese", 0x001B00, 0x001B7F },
{ "Sundanese", 0x001B80, 0x001BBF },
{ "Batak", 0x001BC0, 0x001BFF },
{ "Lepcha", 0x001C00, 0x001C4F },
{ "Ol Chiki", 0x001C50, 0x001C7F },
{ "Cyrillic Extended-C", 0x001C80, 0x001C8F },
{ "Georgian Extended", 0x001C90, 0x001CBF },
{ "Sundanese Supplement", 0x001CC0, 0x001CCF },
{ "Vedic Extensions", 0x001CD0, 0x001CFF },
{ "Phonetic Extensions", 0x001D00, 0x001D7F },
{ "Phonetic Extensions Supplement", 0x001D80, 0x001DBF },
{ "Combining Diacritical Marks Supplement", 0x001DC0, 0x001DFF },
{ "Latin Extended Additional", 0x001E00, 0x001EFF },
{ "Greek Extended", 0x001F00, 0x001FFF },
{ "General Punctuation", 0x002000, 0x00206F },
{ "Superscripts and Subscripts", 0x002070, 0x00209F },
{ "Currency Symbols", 0x0020A0, 0x0020CF },
{ "Combining Diacritical Marks for Symbols", 0x0020D0, 0x0020FF },
{ "Letterlike Symbols", 0x002100, 0x00214F },
{ "Number Forms", 0x002150, 0x00218F },
{ "Arrows", 0x002190, 0x0021FF },
{ "Mathematical Operators", 0x002200, 0x0022FF },
{ "Miscellaneous Technical", 0x002300, 0x0023FF },
{ "Control Pictures", 0x002400, 0x00243F },
{ "Optical Character Recognition", 0x002440, 0x00245F },
{ "Enclosed Alphanumerics", 0x002460, 0x0024FF },
{ "Box Drawing", 0x002500, 0x00257F },
{ "Block Elements", 0x002580, 0x00259F },
{ "Geometric Shapes", 0x0025A0, 0x0025FF },
{ "Miscellaneous Symbols", 0x002600, 0x0026FF },
{ "Dingbats", 0x002700, 0x0027BF },
{ "Miscellaneous Mathematical Symbols-A", 0x0027C0, 0x0027EF },
{ "Supplemental Arrows-A", 0x0027F0, 0x0027FF },
{ "Braille Patterns", 0x002800, 0x0028FF },
{ "Supplemental Arrows-B", 0x002900, 0x00297F },
{ "Miscellaneous Mathematical Symbols-B", 0x002980, 0x0029FF },
{ "Supplemental Mathematical Operators", 0x002A00, 0x002AFF },
{ "Miscellaneous Symbols and Arrows", 0x002B00, 0x002BFF },
{ "Glagolitic", 0x002C00, 0x002C5F },
{ "Latin Extended-C", 0x002C60, 0x002C7F },
{ "Coptic", 0x002C80, 0x002CFF },
{ "Georgian Supplement", 0x002D00, 0x002D2F },
{ "Tifinagh", 0x002D30, 0x002D7F },
{ "Ethiopic Extended", 0x002D80, 0x002DDF },
{ "Cyrillic Extended-A", 0x002DE0, 0x002DFF },
{ "Supplemental Punctuation", 0x002E00, 0x002E7F },
{ "CJK Radicals Supplement", 0x002E80, 0x002EFF },
{ "Kangxi Radicals", 0x002F00, 0x002FDF },
{ "Ideographic Description Characters", 0x002FF0, 0x002FFF },
{ "CJK Symbols and Punctuation", 0x003000, 0x00303F },
{ "Hiragana", 0x003040, 0x00309F },
{ "Katakana", 0x0030A0, 0x0030FF },
{ "Bopomofo", 0x003100, 0x00312F },
{ "Hangul Compatibility Jamo", 0x003130, 0x00318F },
{ "Kanbun", 0x003190, 0x00319F },
{ "Bopomofo Extended", 0x0031A0, 0x0031BF },
{ "CJK Strokes", 0x0031C0, 0x0031EF },
{ "Katakana Phonetic Extensions", 0x0031F0, 0x0031FF },
{ "Enclosed CJK Letters and Months", 0x003200, 0x0032FF },
{ "CJK Compatibility", 0x003300, 0x0033FF },
{ "CJK Unified Ideographs Extension A", 0x003400, 0x004DBF },
{ "Yijing Hexagram Symbols", 0x004DC0, 0x004DFF },
{ "CJK Unified Ideographs", 0x004E00, 0x009FFF },
{ "Yi Syllables", 0x00A000, 0x00A48F },
{ "Yi Radicals", 0x00A490, 0x00A4CF },
{ "Lisu", 0x00A4D0, 0x00A4FF },
{ "Vai", 0x00A500, 0x00A63F },
{ "Cyrillic Extended-B", 0x00A640, 0x00A69F },
{ "Bamum", 0x00A6A0, 0x00A6FF },
{ "Modifier Tone Letters", 0x00A700, 0x00A71F },
{ "Latin Extended-D", 0x00A720, 0x00A7FF },
{ "Syloti Nagri", 0x00A800, 0x00A82F },
{ "Common Indic Number Forms", 0x00A830, 0x00A83F },
{ "Phags-pa", 0x00A840, 0x00A87F },
{ "Saurashtra", 0x00A880, 0x00A8DF },
{ "Devanagari Extended", 0x00A8E0, 0x00A8FF },
{ "Kayah Li", 0x00A900, 0x00A92F },
{ "Rejang", 0x00A930, 0x00A95F },
{ "Hangul Jamo Extended-A", 0x00A960, 0x00A97F },
{ "Javanese", 0x00A980, 0x00A9DF },
{ "Myanmar Extended-B", 0x00A9E0, 0x00A9FF },
{ "Cham", 0x00AA00, 0x00AA5F },
{ "Myanmar Extended-A", 0x00AA60, 0x00AA7F },
{ "Tai Viet", 0x00AA80, 0x00AADF },
{ "Meetei Mayek Extensions", 0x00AAE0, 0x00AAFF },
{ "Ethiopic Extended-A", 0x00AB00, 0x00AB2F },
{ "Latin Extended-E", 0x00AB30, 0x00AB6F },
{ "Cherokee Supplement", 0x00AB70, 0x00ABBF },
{ "Meetei Mayek", 0x00ABC0, 0x00ABFF },
{ "Hangul Syllables", 0x00AC00, 0x00D7AF },
{ "Hangul Jamo Extended-B", 0x00D7B0, 0x00D7FF },
{ "High Surrogates", 0x00D800, 0x00DB7F },
{ "High Private Use Surrogates", 0x00DB80, 0x00DBFF },
{ "Low Surrogates", 0x00DC00, 0x00DFFF },
{ "Private Use Area", 0x00E000, 0x00F8FF },
{ "CJK Compatibility Ideographs", 0x00F900, 0x00FAFF },
{ "Alphabetic Presentation Forms", 0x00FB00, 0x00FB4F },
{ "Arabic Presentation Forms-A", 0x00FB50, 0x00FDFF },
{ "Variation Selectors", 0x00FE00, 0x00FE0F },
{ "Vertical Forms", 0x00FE10, 0x00FE1F },
{ "Combining Half Marks", 0x00FE20, 0x00FE2F },
{ "CJK Compatibility Forms", 0x00FE30, 0x00FE4F },
{ "Small Form Variants", 0x00FE50, 0x00FE6F },
{ "Arabic Presentation Forms-B", 0x00FE70, 0x00FEFF },
{ "Halfwidth and Fullwidth Forms", 0x00FF00, 0x00FFEF },
{ "Specials", 0x00FFF0, 0x00FFFF },
{ "Linear B Syllabary", 0x010000, 0x01007F },
{ "Linear B Ideograms", 0x010080, 0x0100FF },
{ "Aegean Numbers", 0x010100, 0x01013F },
{ "Ancient Greek Numbers", 0x010140, 0x01018F },
{ "Ancient Symbols", 0x010190, 0x0101CF },
{ "Phaistos Disc", 0x0101D0, 0x0101FF },
{ "Lycian", 0x010280, 0x01029F },
{ "Carian", 0x0102A0, 0x0102DF },
{ "Coptic Epact Numbers", 0x0102E0, 0x0102FF },
{ "Old Italic", 0x010300, 0x01032F },
{ "Gothic", 0x010330, 0x01034F },
{ "Old Permic", 0x010350, 0x01037F },
{ "Ugaritic", 0x010380, 0x01039F },
{ "Old Persian", 0x0103A0, 0x0103DF },
{ "Deseret", 0x010400, 0x01044F },
{ "Shavian", 0x010450, 0x01047F },
{ "Osmanya", 0x010480, 0x0104AF },
{ "Osage", 0x0104B0, 0x0104FF },
{ "Elbasan", 0x010500, 0x01052F },
{ "Caucasian Albanian", 0x010530, 0x01056F },
{ "Linear A", 0x010600, 0x01077F },
{ "Cypriot Syllabary", 0x010800, 0x01083F },
{ "Imperial Aramaic", 0x010840, 0x01085F },
{ "Palmyrene", 0x010860, 0x01087F },
{ "Nabataean", 0x010880, 0x0108AF },
{ "Hatran", 0x0108E0, 0x0108FF },
{ "Phoenician", 0x010900, 0x01091F },
{ "Lydian", 0x010920, 0x01093F },
{ "Meroitic Hieroglyphs", 0x010980, 0x01099F },
{ "Meroitic Cursive", 0x0109A0, 0x0109FF },
{ "Kharoshthi", 0x010A00, 0x010A5F },
{ "Old South Arabian", 0x010A60, 0x010A7F },
{ "Old North Arabian", 0x010A80, 0x010A9F },
{ "Manichaean", 0x010AC0, 0x010AFF },
{ "Avestan", 0x010B00, 0x010B3F },
{ "Inscriptional Parthian", 0x010B40, 0x010B5F },
{ "Inscriptional Pahlavi", 0x010B60, 0x010B7F },
{ "Psalter Pahlavi", 0x010B80, 0x010BAF },
{ "Old Turkic", 0x010C00, 0x010C4F },
{ "Old Hungarian", 0x010C80, 0x010CFF },
{ "Hanifi Rohingya", 0x010D00, 0x010D3F },
{ "Rumi Numeral Symbols", 0x010E60, 0x010E7F },
{ "Old Sogdian", 0x010F00, 0x010F2F },
{ "Sogdian", 0x010F30, 0x010F6F },
{ "Brahmi", 0x011000, 0x01107F },
{ "Kaithi", 0x011080, 0x0110CF },
{ "Sora Sompeng", 0x0110D0, 0x0110FF },
{ "Chakma", 0x011100, 0x01114F },
{ "Mahajani", 0x011150, 0x01117F },
{ "Sharada", 0x011180, 0x0111DF },
{ "Sinhala Archaic Numbers", 0x0111E0, 0x0111FF },
{ "Khojki", 0x011200, 0x01124F },
{ "Multani", 0x011280, 0x0112AF },
{ "Khudawadi", 0x0112B0, 0x0112FF },
{ "Grantha", 0x011300, 0x01137F },
{ "Newa", 0x011400, 0x01147F },
{ "Tirhuta", 0x011480, 0x0114DF },
{ "Siddham", 0x011580, 0x0115FF },
{ "Modi", 0x011600, 0x01165F },
{ "Mongolian Supplement", 0x011660, 0x01167F },
{ "Takri", 0x011680, 0x0116CF },
{ "Ahom", 0x011700, 0x01173F },
{ "Dogra", 0x011800, 0x01184F },
{ "Warang Citi", 0x0118A0, 0x0118FF },
{ "Zanabazar Square", 0x011A00, 0x011A4F },
{ "Soyombo", 0x011A50, 0x011AAF },
{ "Pau Cin Hau", 0x011AC0, 0x011AFF },
{ "Bhaiksuki", 0x011C00, 0x011C6F },
{ "Marchen", 0x011C70, 0x011CBF },
{ "Masaram Gondi", 0x011D00, 0x011D5F },
{ "Gunjala Gondi", 0x011D60, 0x011DAF },
{ "Makasar", 0x011EE0, 0x011EFF },
{ "Cuneiform", 0x012000, 0x0123FF },
{ "Cuneiform Numbers and Punctuation", 0x012400, 0x01247F },
{ "Early Dynastic Cuneiform", 0x012480, 0x01254F },
{ "Egyptian Hieroglyphs", 0x013000, 0x01342F },
{ "Anatolian Hieroglyphs", 0x014400, 0x01467F },
{ "Bamum Supplement", 0x016800, 0x016A3F },
{ "Mro", 0x016A40, 0x016A6F },
{ "Bassa Vah", 0x016AD0, 0x016AFF },
{ "Pahawh Hmong", 0x016B00, 0x016B8F },
{ "Medefaidrin", 0x016E40, 0x016E9F },
{ "Miao", 0x016F00, 0x016F9F },
{ "Ideographic Symbols and Punctuation", 0x016FE0, 0x016FFF },
{ "Tangut", 0x017000, 0x0187FF },
{ "Tangut Components", 0x018800, 0x018AFF },
{ "Kana Supplement", 0x01B000, 0x01B0FF },
{ "Kana Extended-A", 0x01B100, 0x01B12F },
{ "Nushu", 0x01B170, 0x01B2FF },
{ "Duployan", 0x01BC00, 0x01BC9F },
{ "Shorthand Format Controls", 0x01BCA0, 0x01BCAF },
{ "Byzantine Musical Symbols", 0x01D000, 0x01D0FF },
{ "Musical Symbols", 0x01D100, 0x01D1FF },
{ "Ancient Greek Musical Notation", 0x01D200, 0x01D24F },
{ "Mayan Numerals", 0x01D2E0, 0x01D2FF },
{ "Tai Xuan Jing Symbols", 0x01D300, 0x01D35F },
{ "Counting Rod Numerals", 0x01D360, 0x01D37F },
{ "Mathematical Alphanumeric Symbols", 0x01D400, 0x01D7FF },
{ "Sutton SignWriting", 0x01D800, 0x01DAAF },
{ "Glagolitic Supplement", 0x01E000, 0x01E02F },
{ "Mende Kikakui", 0x01E800, 0x01E8DF },
{ "Adlam", 0x01E900, 0x01E95F },
{ "Indic Siyaq Numbers", 0x01EC70, 0x01ECBF },
{ "Arabic Mathematical Alphabetic Symbols", 0x01EE00, 0x01EEFF },
{ "Mahjong Tiles", 0x01F000, 0x01F02F },
{ "Domino Tiles", 0x01F030, 0x01F09F },
{ "Playing Cards", 0x01F0A0, 0x01F0FF },
{ "Enclosed Alphanumeric Supplement", 0x01F100, 0x01F1FF },
{ "Enclosed Ideographic Supplement", 0x01F200, 0x01F2FF },
{ "Miscellaneous Symbols and Pictographs", 0x01F300, 0x01F5FF },
{ "Emoticons", 0x01F600, 0x01F64F },
{ "Ornamental Dingbats", 0x01F650, 0x01F67F },
{ "Transport and Map Symbols", 0x01F680, 0x01F6FF },
{ "Alchemical Symbols", 0x01F700, 0x01F77F },
{ "Geometric Shapes Extended", 0x01F780, 0x01F7FF },
{ "Supplemental Arrows-C", 0x01F800, 0x01F8FF },
{ "Supplemental Symbols and Pictographs", 0x01F900, 0x01F9FF },
{ "Chess Symbols", 0x01FA00, 0x01FA6F },
{ "CJK Unified Ideographs Extension B", 0x020000, 0x02A6DF },
{ "CJK Unified Ideographs Extension C", 0x02A700, 0x02B73F },
{ "CJK Unified Ideographs Extension D", 0x02B740, 0x02B81F },
{ "CJK Unified Ideographs Extension E", 0x02B820, 0x02CEAF },
{ "CJK Unified Ideographs Extension F", 0x02CEB0, 0x02EBEF },
{ "CJK Compatibility Ideographs Supplement", 0x02F800, 0x02FA1F },
{ "Tags", 0x0E0000, 0x0E007F },
{ "Variation Selectors Supplement", 0x0E0100, 0x0E01EF },
{ "Supplementary Private Use Area-A", 0x0F0000, 0x0FFFFF },
{ "Supplementary Private Use Area-B", 0x100000, 0x10FFFF },
}
-- Entry count, stored on the table itself; lookup_block reads this field
-- before falling back to `#blocks`.
blocks.length = #blocks
-- Iterator factory over the `blocks` table, for use in a generic `for`:
--     for i, name, first, last in export.enum_blocks() do ... end
-- Returns the stepping function, the table itself as the invariant state
-- and 0 as the initial index.
function export.enum_blocks()
	local function next_block(list, index)
		index = index + 1
		local entry = list[index]
		if entry then
			return index, unpack(entry)
		end
		return nil
	end

	return next_block, blocks, 0
end
-- Returns the name of the plane containing `codepoint`: the entry of the
-- `planes` table when the plane has one, otherwise "Plane N".
function export.lookup_plane(codepoint)
	local plane_number = floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end
-- Returns the name of the block containing `codepoint`.
-- Uses a binary search over `blocks` so that high codepoints do not require
-- walking the whole table. Raises an error when the codepoint lies in none
-- of the listed blocks.
function export.lookup_block(codepoint)
	local low = 1
	local high = blocks.length or #blocks

	while low <= high do
		local middle = floor((low + high) / 2)
		local entry = blocks[middle]
		local first, last = entry[2], entry[3]

		if codepoint < first then
			-- Target lies in an earlier block.
			high = middle - 1
		elseif codepoint <= last then
			return entry[1]
		else
			-- Target lies in a later block.
			low = middle + 1
		end
	end

	error(("No block found for codepoint U+%04X."):format(codepoint))
end
-- Returns the first and last codepoint of the block called `name`, or
-- nothing when no block has that name. The whole table is scanned, so if a
-- name occurred more than once the last occurrence would be used.
function export.get_block_range(name)
	local found
	for _, entry in ipairs(blocks) do
		if entry[1] == name then
			found = entry
		end
	end

	if not found then
		return
	end
	return found[2], found[3]
end
-- Reports whether `pagename` is acceptable as a page title:
--   * it contains none of the explicitly forbidden characters,
--   * every character is printable according to export.is_printable, and
--   * at least one character is not a space separator.
function export.is_valid_pagename(pagename)
	local forbidden = {
		[0x0023] = true, -- #
		[0x005B] = true, -- [
		[0x005D] = true, -- ]
		[0x007B] = true, -- {
		[0x007C] = true, -- |
		[0x007D] = true, -- }
		[0x180E] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xFFFD] = true, -- REPLACEMENT CHARACTER
	}
	local seen_non_space = false

	for cp in mw.ustring.gcodepoint(pagename) do
		-- 0x2000..0x200A: spaces in General Punctuation block
		if forbidden[cp] or (0x2000 <= cp and cp <= 0x200A) then
			return false
		end

		local printable, category = export.is_printable(cp)
		if not printable then
			return false
		end

		if category ~= "space-separator" then
			seen_non_space = true
		end
	end

	return seen_non_space
end
-- Returns the elements of the sequence `what` as multiple values, starting
-- at index `from` (default 1). The elements are first copied into a fresh
-- table by walking `what` with ipairs, and that copy is then unpacked.
local function manual_unpack(what, from)
	local first = from or 1
	local copied, count = {}, 0

	for index, value in ipairs(what) do
		if index >= first then
			count = count + 1
			copied[count] = value
		end
	end

	return unpack(copied)
end
-- Builds a memoising lookup function over range-based character data.
--
-- loader      called once, on first use; must return two tables:
--             `singles` (codepoint -> value) and `ranges`, a sequence of
--             { first, last, value... } entries sorted by `first`.
-- match_func  called as match_func(codepoint, value...) to produce the
--             result that the lookup returns.
-- ...         default value(s) passed to match_func for a codepoint that is
--             covered by no range.
--
-- Every range (or gap between ranges) that produced an answer is kept in a
-- private cache that is consulted before `ranges` is scanned again.
local function memo_lookup(loader, match_func, ...)
	local dots = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		for _, range in pairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end

		-- End of the previous range; lastlast + 1 is where the current gap
		-- starts.
		local lastlast = -1

		-- ipairs, not pairs: the early exit below is only correct when the
		-- ranges are visited in ascending order, and pairs gives no
		-- ordering guarantee.
		for _, range in ipairs(ranges) do
			if codepoint < range[1] then
				-- Codepoint sits in the gap before this range.
				table.insert(cache, { lastlast + 1, range[1] - 1, unpack(dots) })
				return match_func(codepoint, unpack(dots))
			elseif codepoint <= range[2] then
				-- manual_unpack copies the entry through ipairs instead of
				-- calling unpack on it directly.
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			else
				lastlast = range[2]
			end
		end

		-- Beyond the last range: hand over the same defaults that
		-- codepoints in a gap between ranges receive.
		return match_func(codepoint, unpack(dots))
	end
end
-- Tells whether a codepoint is a combining character.
-- The canonical combining class is looked up in
-- [[Module:Unicode data/combining]]; codepoints not listed there default to
-- class 0. Returns true when the class is non-zero and false otherwise.
-- See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
-- for more information.
export.is_combining = memo_lookup(
	function ()
		local combining_data = mw.loadData('Module:Unicode data/combining')
		return combining_data.single, combining_data.ranges
	end,
	function (codepoint, combining_class)
		if combining_class and combining_class ~= 0 then
			return true
		end
		return false
	end,
	0
)
-- Returns `str` with a dotted circle (◌) inserted before every character
-- that export.is_combining reports as combining. Other characters are left
-- as they are.
function export.add_dotted_circle(str)
	local function prefix_if_combining(char)
		if export.is_combining(mw.ustring.codepoint(char)) then
			return '◌' .. char
		end
	end

	-- Only the substituted string is returned, not the replacement count.
	local with_circles = mw.ustring.gsub(str, ".", prefix_if_combining)
	return with_circles
end
-- Looks up a codepoint's category in [[Module:Unicode data/control]].
-- Codepoints without an entry are reported as "assigned".
local lookup_control = memo_lookup(
	function ()
		local control_data = mw.loadData('Module:Unicode data/control')
		return control_data.single, control_data.ranges
	end,
	function (codepoint, ccc)
		if ccc then
			return ccc
		end
		return "assigned"
	end,
	"assigned"
)
-- True unless lookup_control classifies the codepoint as "unassigned".
function export.is_assigned(codepoint)
	local category = lookup_control(codepoint)
	return category ~= "unassigned"
end
-- Returns two values: whether the codepoint's category is "assigned" or
-- "space-separator", and the category itself.
function export.is_printable(codepoint)
	local category = lookup_control(codepoint)
	local printable = category == "assigned" or category == "space-separator"
	return printable, category
end
-- Returns two values: whether the codepoint's category is
-- "space-separator", and the category itself.
function export.is_whitespace(codepoint)
	local category = lookup_control(codepoint)
	local is_space = category == "space-separator"
	return is_space, category
end
-- to be used in language-neutral context only (e.g. character lists)
-- Filled in by get_script on first use: script code -> character-class
-- pattern.
local script_pats

-- Scripts that consist entirely of characters from another script.
-- get_script builds no pattern for any code listed here.
local script_blacklist = {}
for _, code in ipairs {
	"Latf",
	"Hans",
	"Hant",
	"Kore",
	"Jpan",
	"fa-Arab",
	"kk-Arab",
	"ks-Arab",
	"ku-Arab",
	"mzn-Arab",
	"ota-Arab",
	"pa-Arab",
	"ps-Arab",
	"sd-Arab",
	"tt-Arab",
	"ug-Arab",
	"ur-Arab",
	"nv-Latn",
	"pjt-Latn",
	"Zyyy",
} do
	script_blacklist[code] = true
end
--[[
	Problem scripts: Grek and polytonic, Cyrl and Cyrs, Latn and Latinx.
	Each key is a script code and its value is the code that takes
	precedence over it when a text matches both.
]]
local overridden_by = {
	Cyrs = "Cyrl",
	polytonic = "Grek",
	Latinx = "Latn",
}
-- Patterns that have already produced a match: pattern -> script code.
local script_cache = {}

-- Returns the code of a script whose character pattern matches the
-- argument, or "None" when no pattern matches.
-- `codepoint` may be a number (a codepoint) or a string.
-- Patterns are built on first use from [[Module:scripts/data]], leaving out
-- scripts in script_blacklist. When the matching script has an entry in
-- overridden_by and the overriding script matches too, the overriding
-- script is returned instead.
function export.get_script(codepoint)
	local arg_type = type(codepoint)
	local text
	if arg_type == "number" then
		text = mw.ustring.char(codepoint)
	elseif arg_type == "string" then
		text = codepoint
	else
		error("Argument to get_script should be a number (codepoint) or string.")
	end

	-- Previously successful patterns first; cached scripts that can be
	-- overridden are skipped here and re-evaluated below.
	for cached_pattern, cached_script in pairs(script_cache) do
		if mw.ustring.match(text, cached_pattern) and not overridden_by[cached_script] then
			return cached_script
		end
	end

	if not script_pats then
		local script_data = mw.loadData("Module:scripts/data")
		script_pats = {}
		for code, info in pairs(script_data) do
			if info.characters and not script_blacklist[code] then
				script_pats[code] = "[" .. info.characters .. "]"
			end
		end
	end

	for code, pattern in pairs(script_pats) do
		if mw.ustring.match(text, pattern) then
			local winner, winner_pattern = code, pattern

			local preferred = overridden_by[code]
			if preferred then
				local preferred_pattern = script_pats[preferred]
				if preferred_pattern and mw.ustring.match(text, preferred_pattern) then
					winner, winner_pattern = preferred, preferred_pattern
				end
			end

			script_cache[winner_pattern] = winner
			return winner
		end
	end

	return "None"
end
-- Comparator for table.sort: orders ranges by their first element.
local function sortRange(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end
--[[
	Binary search: more efficient for the longer lists of codepoint ranges than
	for the shorter ones.

	ranges	sequence of { first, last, ... } entries sorted by `first`, or nil
	value	number to locate

	Returns the entry whose [first, last] interval contains `value`, or nil
	when `ranges` is nil or empty or no entry contains the value.
]]
local function binary_search(ranges, value)
	if not ranges then
		return nil
	end

	local iStart = 1
	-- Can't use # because table is loaded by mw.loadData.
	local iEnd = ranges.length or require("Module:table").size(ranges)

	if iEnd == 0 then
		return nil
	end

	while iStart <= iEnd do
		local iMid = floor((iStart + iEnd) / 2)
		local range = ranges[iMid]

		if range[1] > value then
			-- Value lies before this entry.
			iEnd = iMid - 1
		elseif value <= range[2] then
			-- Match. Assumes there are no duplicates.
			return range
		else
			-- Value lies after this entry.
			iStart = iMid + 1
		end
	end

	return nil
end
-- Linear search through `ranges`, a sequence of { first, last, value }
-- entries in ascending order. Returns the value of the entry containing
-- `number`; gives up with nil as soon as an entry starts above `number`,
-- and returns nothing when the list is exhausted.
local function look_up_in_order(number, ranges)
	for _, entry in ipairs(ranges) do
		local first, last = entry[1], entry[2]
		if number < first then
			return nil
		end
		if number <= last then
			return entry[3]
		end
	end
end
-- Save previously used codepoint ranges in case another character is in the
-- same range.
local ranges_cache = {}

--[=[
	Takes a codepoint or a character and finds the script code (if any) that is
	appropriate for it based on the codepoint, using the data module
	[[Module:Unicode data/scripts]]. The data module was generated from the
	patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].

	Converts the character to a codepoint. Returns a script code if the codepoint
	is in the list of individual characters, or if it is in one of the defined
	ranges in the 4096-character block that it belongs to, else returns "None".

	Raises an error if the argument is neither a string nor a number, or if
	it is a string that does not consist of exactly one character.
]=]
function export.char_to_script(char)
	local lookup = mw.loadData("Module:Unicode data/scripts")
	local t = type(char)
	local codepoint
	if t == "string" then
		local etc
		-- Ask for the first two codepoints: with the default arguments only
		-- one is ever returned, so a longer string could not be detected.
		codepoint, etc = mw.ustring.codepoint(char, 1, 2)
		-- `codepoint` is nil for an empty string.
		if etc or not codepoint then
			error("Argument to char_to_script should be a single character.")
		end
	elseif t == "number" then
		codepoint = char
	else
		error("Argument to char_to_script should be a string or a number, but its type is " .. t .. ".")
	end

	local individual_match = lookup.individual[codepoint]
	if individual_match then
		return individual_match
	else
		-- Ranges that matched on earlier calls.
		local script = look_up_in_order(codepoint, ranges_cache)
		if script then
			return script
		end

		-- lookup.blocks is searched by 4096-codepoint page index.
		local index = floor(codepoint / 0x1000)

		script = look_up_in_order(index, lookup.blocks)
		if script then
			return script
		end

		local range = binary_search(lookup[index], codepoint)
		if range then
			-- Keep the cache sorted: look_up_in_order stops at the first
			-- entry that starts above the codepoint.
			table.insert(ranges_cache, range)
			table.sort(ranges_cache, sortRange)
			return range[3]
		end
	end

	return "None"
end
-- Returns the script code that export.char_to_script assigns to the largest
-- number of characters in `text` (split into UTF-8 sequences), or nil when
-- no character is found. "None" is counted like any other result.
function export.find_best_script(text)
	local tally = {}
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local code = export.char_to_script(character)
		local seen = tally[code] or 0
		tally[code] = seen + 1
	end

	local winner
	local winning_count = 0
	for code, count in pairs(tally) do
		if count > winning_count then
			winner, winning_count = code, count
		end
	end

	return winner
end
-- Codepoints for which get_entry_title returns a fixed title instead of the
-- character itself: codepoint -> title.
local unsupported_title = {
	[0x20] = "Unsupported titles/Space",
	[0x23] = "Unsupported titles/Number sign",
	[0x2E] = "Unsupported titles/Full stop",
	[0x3A] = "Unsupported titles/Colon",
	[0x3C] = "Unsupported titles/Less than",
	[0x3E] = "Unsupported titles/Greater than",
	[0x5B] = "Unsupported titles/Left square bracket",
	[0x5D] = "Unsupported titles/Right square bracket",
	[0x5F] = "Unsupported titles/Low line",
	[0x7B] = "Unsupported titles/Left curly bracket",
	[0x7C] = "Unsupported titles/Vertical line",
	[0x7D] = "Unsupported titles/Right curly bracket",
	[0x1680] = "Unsupported titles/Ogham space",
	[0xFFFD] = "Unsupported titles/Replacement character",
}
-- Returns the page title for a codepoint: the fixed title from
-- unsupported_title when there is one, otherwise the character itself if
-- lookup_control classifies it as "assigned", otherwise nil.
function export.get_entry_title(codepoint)
	local special_title = unsupported_title[codepoint]
	if special_title then
		return special_title
	end

	if lookup_control(codepoint) == "assigned" then
		return mw.ustring.char(codepoint)
	end
	return nil
end
return export