summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/webkit/WebCore/platform/text/UnicodeRange.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/3rdparty/webkit/WebCore/platform/text/UnicodeRange.cpp')
-rw-r--r--src/3rdparty/webkit/WebCore/platform/text/UnicodeRange.cpp462
1 files changed, 462 insertions, 0 deletions
diff --git a/src/3rdparty/webkit/WebCore/platform/text/UnicodeRange.cpp b/src/3rdparty/webkit/WebCore/platform/text/UnicodeRange.cpp
new file mode 100644
index 0000000..0373441
--- /dev/null
+++ b/src/3rdparty/webkit/WebCore/platform/text/UnicodeRange.cpp
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) 2007 Apple Computer, Inc.
+ *
+ * Portions are Copyright (C) 1998 Netscape Communications Corporation.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Alternatively, the contents of this file may be used under the terms
+ * of either the Mozilla Public License Version 1.1, found at
+ * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
+ * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
+ * (the "GPL"), in which case the provisions of the MPL or the GPL are
+ * applicable instead of those above. If you wish to allow use of your
+ * version of this file only under the terms of one of those two
+ * licenses (the MPL or the GPL) and not to allow others to use your
+ * version of this file under the LGPL, indicate your decision by
+ * deleting the provisions above and replace them with the notice and
+ * other provisions required by the MPL or the GPL, as the case may be.
+ * If you do not delete the provisions above, a recipient may use your
+ * version of this file under any of the LGPL, the MPL or the GPL.
+ */
+
+#include "config.h"
+#include "UnicodeRange.h"
+
+namespace WebCore {
+
+// This table depends on unicode range definitions.
+// Each item's index must correspond to a unicode range value
+// eg. x-cyrillic = LangGroupTable[cRangeCyrillic]
+static const char* gUnicodeRangeToLangGroupTable[] =
+{ // entry N is the language group returned for unicode range value N
+ "x-cyrillic", // 0, cRangeCyrillic
+ "el", // 1, Greek
+ "tr", // 2, Turkish
+ "he", // 3, Hebrew
+ "ar", // 4, Arabic
+ "x-baltic", // 5
+ "th", // 6, Thai
+ "ko", // 7, Korean
+ "ja", // 8, Japanese
+ "zh-CN", // 9
+ "zh-TW", // 10
+ "x-devanagari", // 11
+ "x-tamil", // 12
+ "x-armn", // 13, Armenian
+ "x-beng", // 14, Bengali
+ "x-cans", // 15, Canadian Aboriginal Syllabics
+ "x-ethi", // 16, Ethiopic
+ "x-geor", // 17, Georgian
+ "x-gujr", // 18, Gujarati
+ "x-guru", // 19, Gurmukhi
+ "x-khmr", // 20, Khmer
+ "x-mlym" // 21, Malayalam
+}; // 22 entries; langGroupFromUnicodeRange() indexes this with any value below cRangeSpecificItemNum
+
+/**********************************************************************
+ * Unicode subranges as defined in unicode 3.0
+ * x-western, x-central-euro, tr, x-baltic -> latin
+ * 0000 - 036f
+ * 1e00 - 1eff
+ * 2000 - 206f (general punctuation)
+ * 20a0 - 20cf (currency symbols)
+ * 2100 - 214f (letterlike symbols)
+ * 2150 - 218f (Number Forms)
+ * el -> greek
+ * 0370 - 03ff
+ * 1f00 - 1fff
+ * x-cyrillic -> cyrillic
+ * 0400 - 04ff
+ * he -> hebrew
+ * 0590 - 05ff
+ * ar -> arabic
+ * 0600 - 06ff
+ * fb50 - fdff (arabic presentation forms)
+ * fe70 - feff (arabic presentation forms b)
+ * th - thai
+ * 0e00 - 0e7f
+ * ko -> korean
+ * ac00 - d7af (hangul Syllables)
+ * 1100 - 11ff (jamo)
+ * 3130 - 318f (hangul compatibility jamo)
+ * ja
+ * 3040 - 309f (hiragana)
+ * 30a0 - 30ff (katakana)
+ * zh-CN
+ * zh-TW
+ *
+ * CJK
+ * 3100 - 312f (bopomofo)
+ * 31a0 - 31bf (bopomofo extended)
+ * 3000 - 303f (CJK Symbols and Punctuation)
+ * 2e80 - 2eff (CJK radicals supplement)
+ * 2f00 - 2fdf (Kangxi Radicals)
+ * 2ff0 - 2fff (Ideographic Description Characters)
+ * 3190 - 319f (kanbun)
+ * 3200 - 32ff (Enclosed CJK letters and Months)
+ * 3300 - 33ff (CJK compatibility)
+ * 3400 - 4dbf (CJK Unified Ideographs Extension A)
+ * 4e00 - 9faf (CJK Unified Ideographs)
+ * f900 - fa5f (CJK Compatibility Ideographs)
+ * fe30 - fe4f (CJK compatibility Forms)
+ * ff00 - ffef (halfwidth and fullwidth forms)
+ *
+ * Armenian
+ * 0530 - 058f
+ * Syriac
+ * 0700 - 074f
+ * Thaana
+ * 0780 - 07bf
+ * Devanagari
+ * 0900 - 097f
+ * Bengali
+ * 0980 - 09ff
+ * Gurmukhi
+ * 0a00 - 0a7f
+ * Gujarati
+ * 0a80 - 0aff
+ * Oriya
+ * 0b00 - 0b7f
+ * Tamil
+ * 0b80 - 0bff
+ * Telugu
+ * 0c00 - 0c7f
+ * Kannada
+ * 0c80 - 0cff
+ * Malayalam
+ * 0d00 - 0d7f
+ * Sinhala
+ * 0d80 - 0def
+ * Lao
+ * 0e80 - 0eff
+ * Tibetan
+ * 0f00 - 0fbf
+ * Myanmar
+ * 1000 - 109f
+ * Georgian
+ * 10a0 - 10ff
+ * Ethiopic
+ * 1200 - 137f
+ * Cherokee
+ * 13a0 - 13ff
+ * Canadian Aboriginal Syllabics
+ * 1400 - 167f
+ * Ogham
+ * 1680 - 169f
+ * Runic
+ * 16a0 - 16ff
+ * Khmer
+ * 1780 - 17ff
+ * Mongolian
+ * 1800 - 18af
+ * Misc - superscripts and subscripts
+ * 2070 - 209f
+ * Misc - Combining Diacritical Marks for Symbols
+ * 20d0 - 20ff
+ * Misc - Arrows
+ * 2190 - 21ff
+ * Misc - Mathematical Operators
+ * 2200 - 22ff
+ * Misc - Miscellaneous Technical
+ * 2300 - 23ff
+ * Misc - Control Pictures
+ * 2400 - 243f
+ * Misc - Optical Character Recognition
+ * 2440 - 245f
+ * Misc - Enclosed Alphanumerics
+ * 2460 - 24ff
+ * Misc - Box Drawing
+ * 2500 - 257f
+ * Misc - Block Elements
+ * 2580 - 259f
+ * Misc - Geometric Shapes
+ * 25a0 - 25ff
+ * Misc - Miscellaneous Symbols
+ * 2600 - 267f
+ * Misc - Dingbats
+ * 2700 - 27bf
+ * Misc - Braille Patterns
+ * 2800 - 28ff
+ * Yi Syllables
+ * a000 - a48f
+ * Yi radicals
+ * a490 - a4cf
+ * Alphabetic Presentation Forms
+ * fb00 - fb4f
+ * Misc - Combining half Marks
+ * fe20 - fe2f
+ * Misc - small form variants
+ * fe50 - fe6f
+ * Misc - Specials
+ * fff0 - ffff
+ *********************************************************************/
+
+static const unsigned cNumSubTables = 9; // row 0 (top level) plus rows 1-8 reached through cRangeTableBase+1 .. +8
+static const unsigned cSubTableSize = 16; // one entry per value of the 4-bit nibble used as the index
+
+static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] =
+{
+ { // table for X--- (row 0): indexed by ch >> 12; an entry cRangeTableBase+N redirects to row N
+ cRangeTableBase+1, //u0xxx
+ cRangeTableBase+2, //u1xxx
+ cRangeTableBase+3, //u2xxx
+ cRangeSetCJK, //u3xxx
+ cRangeSetCJK, //u4xxx
+ cRangeSetCJK, //u5xxx
+ cRangeSetCJK, //u6xxx
+ cRangeSetCJK, //u7xxx
+ cRangeSetCJK, //u8xxx
+ cRangeSetCJK, //u9xxx
+ cRangeTableBase+4, //uaxxx
+ cRangeKorean, //ubxxx
+ cRangeKorean, //ucxxx
+ cRangeTableBase+5, //udxxx
+ cRangePrivate, //uexxx
+ cRangeTableBase+6 //ufxxx
+ },
+ { //table for 0X-- (row 1): indexed by (ch & 0x0f00) >> 8
+ cRangeSetLatin, //u00xx
+ cRangeSetLatin, //u01xx
+ cRangeSetLatin, //u02xx
+ cRangeGreek, //u03xx XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks
+ cRangeCyrillic, //u04xx
+ cRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
+ cRangeArabic, //u06xx
+ cRangeTertiaryTable, //u07xx, resolved through gUnicodeTertiaryRangeTable
+ cRangeUnassigned, //u08xx
+ cRangeTertiaryTable, //u09xx
+ cRangeTertiaryTable, //u0axx
+ cRangeTertiaryTable, //u0bxx
+ cRangeTertiaryTable, //u0cxx
+ cRangeTertiaryTable, //u0dxx
+ cRangeTertiaryTable, //u0exx
+ cRangeTibetan, //u0fxx
+ },
+ { //table for 1x-- (row 2): indexed by (ch & 0x0f00) >> 8
+ cRangeTertiaryTable, //u10xx
+ cRangeKorean, //u11xx
+ cRangeEthiopic, //u12xx
+ cRangeTertiaryTable, //u13xx
+ cRangeCanadian, //u14xx
+ cRangeCanadian, //u15xx
+ cRangeTertiaryTable, //u16xx
+ cRangeKhmer, //u17xx
+ cRangeMongolian, //u18xx
+ cRangeUnassigned, //u19xx
+ cRangeUnassigned, //u1axx
+ cRangeUnassigned, //u1bxx
+ cRangeUnassigned, //u1cxx
+ cRangeUnassigned, //u1dxx
+ cRangeSetLatin, //u1exx
+ cRangeGreek, //u1fxx
+ },
+ { //table for 2x-- (row 3): indexed by (ch & 0x0f00) >> 8
+ cRangeSetLatin, //u20xx, general punctuation, super/subscripts, currency symbols
+ cRangeSetLatin, //u21xx, letterlike symbols, number forms, arrows
+ cRangeMathOperators, //u22xx
+ cRangeMiscTechnical, //u23xx
+ cRangeControlOpticalEnclose, //u24xx, control pictures, OCR, enclosed alphanumerics
+ cRangeBoxBlockGeometrics, //u25xx, box drawing, block elements, geometric shapes
+ cRangeMiscSymbols, //u26xx
+ cRangeDingbats, //u27xx
+ cRangeBraillePattern, //u28xx
+ cRangeUnassigned, //u29xx
+ cRangeUnassigned, //u2axx
+ cRangeUnassigned, //u2bxx
+ cRangeUnassigned, //u2cxx
+ cRangeUnassigned, //u2dxx
+ cRangeSetCJK, //u2exx
+ cRangeSetCJK, //u2fxx
+ },
+ { //table for ax-- (row 4): indexed by (ch & 0x0f00) >> 8
+ cRangeYi, //ua0xx
+ cRangeYi, //ua1xx
+ cRangeYi, //ua2xx
+ cRangeYi, //ua3xx
+ cRangeYi, //ua4xx
+ cRangeUnassigned, //ua5xx
+ cRangeUnassigned, //ua6xx
+ cRangeUnassigned, //ua7xx
+ cRangeUnassigned, //ua8xx
+ cRangeUnassigned, //ua9xx
+ cRangeUnassigned, //uaaxx
+ cRangeUnassigned, //uabxx
+ cRangeKorean, //uacxx
+ cRangeKorean, //uadxx
+ cRangeKorean, //uaexx
+ cRangeKorean, //uafxx
+ },
+ { //table for dx-- (row 5): indexed by (ch & 0x0f00) >> 8
+ cRangeKorean, //ud0xx
+ cRangeKorean, //ud1xx
+ cRangeKorean, //ud2xx
+ cRangeKorean, //ud3xx
+ cRangeKorean, //ud4xx
+ cRangeKorean, //ud5xx
+ cRangeKorean, //ud6xx
+ cRangeKorean, //ud7xx
+ cRangeSurrogate, //ud8xx
+ cRangeSurrogate, //ud9xx
+ cRangeSurrogate, //udaxx
+ cRangeSurrogate, //udbxx
+ cRangeSurrogate, //udcxx
+ cRangeSurrogate, //uddxx
+ cRangeSurrogate, //udexx
+ cRangeSurrogate, //udfxx
+ },
+ { // table for fx-- (row 6): indexed by (ch & 0x0f00) >> 8
+ cRangePrivate, //uf0xx
+ cRangePrivate, //uf1xx
+ cRangePrivate, //uf2xx
+ cRangePrivate, //uf3xx
+ cRangePrivate, //uf4xx
+ cRangePrivate, //uf5xx
+ cRangePrivate, //uf6xx
+ cRangePrivate, //uf7xx
+ cRangePrivate, //uf8xx
+ cRangeSetCJK, //uf9xx
+ cRangeSetCJK, //ufaxx
+ cRangeArabic, //ufbxx, includes alphabetic presentation forms
+ cRangeArabic, //ufcxx
+ cRangeArabic, //ufdxx
+ cRangeArabic, //ufexx, includes Combining half marks,
+ // CJK compatibility forms,
+ // small form variants,
+ // Arabic presentation forms B
+ cRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials
+ },
+ { //table for 0x0500 - 0x05ff (row 7): indexed by (ch & 0x00f0) >> 4
+ cRangeCyrillic, //u050x
+ cRangeCyrillic, //u051x
+ cRangeCyrillic, //u052x
+ cRangeArmenian, //u053x
+ cRangeArmenian, //u054x
+ cRangeArmenian, //u055x
+ cRangeArmenian, //u056x
+ cRangeArmenian, //u057x
+ cRangeArmenian, //u058x
+ cRangeHebrew, //u059x
+ cRangeHebrew, //u05ax
+ cRangeHebrew, //u05bx
+ cRangeHebrew, //u05cx
+ cRangeHebrew, //u05dx
+ cRangeHebrew, //u05ex
+ cRangeHebrew, //u05fx
+ },
+ { //table for 0xff00 - 0xffff (row 8): indexed by (ch & 0x00f0) >> 4
+ cRangeSetCJK, //uff0x, fullwidth latin
+ cRangeSetCJK, //uff1x, fullwidth latin
+ cRangeSetCJK, //uff2x, fullwidth latin
+ cRangeSetCJK, //uff3x, fullwidth latin
+ cRangeSetCJK, //uff4x, fullwidth latin
+ cRangeSetCJK, //uff5x, fullwidth latin
+ cRangeSetCJK, //uff6x, halfwidth katakana
+ cRangeSetCJK, //uff7x, halfwidth katakana
+ cRangeSetCJK, //uff8x, halfwidth katakana
+ cRangeSetCJK, //uff9x, halfwidth katakana
+ cRangeSetCJK, //uffax, halfwidth hangul jamo
+ cRangeSetCJK, //uffbx, halfwidth hangul jamo
+ cRangeSetCJK, //uffcx, halfwidth hangul jamo
+ cRangeSetCJK, //uffdx, halfwidth hangul jamo
+ cRangeSetCJK, //uffex, fullwidth symbols
+ cRangeSpecials, //ufffx, Specials
+ },
+};
+
+// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
+// code points so that the number of entries in the tertiary range
+// table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
+// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
+// syllabaries take multiple chunks and Ogham and Runic share a single chunk.
+static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80); // 32 blocks of 128 code points
+
+static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] =
+{ //table for 0x0700 - 0x16ff, indexed by (ch - 0x0700) >> 7
+ cRangeSyriac, //u0700 - u077f
+ cRangeThaana, //u0780 - u07ff
+ cRangeUnassigned, //u0800 - u087f place holder (resolved in the secondary table)
+ cRangeUnassigned, //u0880 - u08ff place holder (resolved in the secondary table)
+ cRangeDevanagari, //u0900 - u097f
+ cRangeBengali, //u0980 - u09ff
+ cRangeGurmukhi, //u0a00 - u0a7f
+ cRangeGujarati, //u0a80 - u0aff
+ cRangeOriya, //u0b00 - u0b7f
+ cRangeTamil, //u0b80 - u0bff
+ cRangeTelugu, //u0c00 - u0c7f
+ cRangeKannada, //u0c80 - u0cff
+ cRangeMalayalam, //u0d00 - u0d7f
+ cRangeSinhala, //u0d80 - u0dff
+ cRangeThai, //u0e00 - u0e7f
+ cRangeLao, //u0e80 - u0eff
+ cRangeTibetan, //u0f00 - u0f7f place holder (resolved in the secondary table)
+ cRangeTibetan, //u0f80 - u0fff place holder (resolved in the secondary table)
+ cRangeMyanmar, //u1000 - u107f
+ cRangeGeorgian, //u1080 - u10ff
+ cRangeKorean, //u1100 - u117f place holder (resolved in the secondary table)
+ cRangeKorean, //u1180 - u11ff place holder (resolved in the secondary table)
+ cRangeEthiopic, //u1200 - u127f place holder (resolved in the secondary table)
+ cRangeEthiopic, //u1280 - u12ff place holder (resolved in the secondary table)
+ cRangeEthiopic, //u1300 - u137f
+ cRangeCherokee, //u1380 - u13ff
+ cRangeCanadian, //u1400 - u147f place holder (resolved in the secondary table)
+ cRangeCanadian, //u1480 - u14ff place holder (resolved in the secondary table)
+ cRangeCanadian, //u1500 - u157f place holder (resolved in the secondary table)
+ cRangeCanadian, //u1580 - u15ff place holder (resolved in the secondary table)
+ cRangeCanadian, //u1600 - u167f
+ cRangeOghamRunic, //u1680 - u16ff this contains two scripts, Ogham & Runic
+};
+
+// Returns the unicode range value for a code point by walking up to three
+// levels of lookup tables:
+//   1. gUnicodeSubrangeTable[0], indexed by bits 12-15 of the code point;
+//   2. the sub-table selected by level 1, indexed by bits 8-11;
+//   3. either a further sub-table indexed by bits 4-7 (used for u05xx and
+//      uffxx), or gUnicodeTertiaryRangeTable, indexed in blocks of 128 code
+//      points (Indic, Southeast Asian and some other scripts between
+//      U+0700 and U+16FF).
+// A table entry below cRangeTableBase is a final answer; any other entry
+// redirects to the next level.
+//
+// A two level index is almost enough for locating a range, with the
+// exception of u03xx and u05xx. u05xx is split at the third level, but u03xx
+// is not: since we don't really care about the range for combining
+// diacritical marks in our font application, 0300 - 036f is reported as
+// cRangeGreek. Future adoption of this method for other uses should be aware
+// of this limitation. The implementation can be extended if there is such a
+// need.
+//
+// Any value at or above 0xFFFF yields 0, and so does any value that cannot
+// index the tables at all (e.g. a negative ch, if UChar32 is signed).
+// NOTE(review): 0 appears to be the value of cRangeCyrillic (index 0 of the
+// language group table), so callers cannot tell "not covered" apart from
+// Cyrillic; U+FFFF itself also takes this path instead of the ufffx table
+// entry. Both are kept as-is for compatibility -- confirm against callers.
+unsigned int findCharUnicodeRange(UChar32 ch)
+{
+    // Work on an unsigned copy: a negative ch would otherwise slip past the
+    // upper-bound check below and turn into a negative table index.
+    const unsigned codePoint = static_cast<unsigned>(ch);
+    if (codePoint >= 0xFFFF)
+        return 0;
+
+    // Level 1: top nibble.
+    unsigned int range = gUnicodeSubrangeTable[0][codePoint >> 12];
+    if (range < cRangeTableBase)
+        return range;
+
+    // Level 2: second nibble, in the sub-table chosen by level 1.
+    range = gUnicodeSubrangeTable[range - cRangeTableBase][(codePoint & 0x0f00) >> 8];
+    if (range < cRangeTableBase)
+        return range;
+
+    // Level 3, sub-table variant: third nibble.
+    if (range < cRangeTertiaryTable)
+        return gUnicodeSubrangeTable[range - cRangeTableBase][(codePoint & 0x00f0) >> 4];
+
+    // Level 3, tertiary variant: U+0700 - U+16FF in 128 code point blocks.
+    // The secondary tables only emit cRangeTertiaryTable inside that span,
+    // so the index stays within gUnicodeTertiaryRangeTable.
+    return gUnicodeTertiaryRangeTable[(codePoint - 0x0700) >> 7];
+}
+
+// Looks up the language group name for a unicode range value. Only values
+// below cRangeSpecificItemNum have an entry in gUnicodeRangeToLangGroupTable;
+// every other value yields a null pointer.
+const char* langGroupFromUnicodeRange(unsigned char unicodeRange)
+{
+    const bool hasLangGroup = unicodeRange < cRangeSpecificItemNum;
+    return hasLangGroup ? gUnicodeRangeToLangGroupTable[unicodeRange] : 0;
+}
+
+}