uchar.h File Reference

C API: Unicode Properties. More...

#include "unicode/utypes.h"

Go to the source code of this file.

Defines

#define U_UNICODE_VERSION   "5.1"
 Unicode version number, default for the current ICU version.
#define UCHAR_MIN_VALUE   0
 Constants.
#define UCHAR_MAX_VALUE   0x10ffff
 The highest Unicode code point value (scalar value) according to The Unicode Standard.
#define U_MASK(x)   ((uint32_t)1<<(x))
 Get a single-bit bit set (a flag) from a bit number 0.
#define U_GC_CN_MASK   U_MASK(U_GENERAL_OTHER_TYPES)
 U_GC_XX_MASK constants are bit flags corresponding to Unicode general category values.
#define U_GC_LU_MASK   U_MASK(U_UPPERCASE_LETTER)
 Mask constant for a UCharCategory.
#define U_GC_LL_MASK   U_MASK(U_LOWERCASE_LETTER)
 Mask constant for a UCharCategory.
#define U_GC_LT_MASK   U_MASK(U_TITLECASE_LETTER)
 Mask constant for a UCharCategory.
#define U_GC_LM_MASK   U_MASK(U_MODIFIER_LETTER)
 Mask constant for a UCharCategory.
#define U_GC_LO_MASK   U_MASK(U_OTHER_LETTER)
 Mask constant for a UCharCategory.
#define U_GC_MN_MASK   U_MASK(U_NON_SPACING_MARK)
 Mask constant for a UCharCategory.
#define U_GC_ME_MASK   U_MASK(U_ENCLOSING_MARK)
 Mask constant for a UCharCategory.
#define U_GC_MC_MASK   U_MASK(U_COMBINING_SPACING_MARK)
 Mask constant for a UCharCategory.
#define U_GC_ND_MASK   U_MASK(U_DECIMAL_DIGIT_NUMBER)
 Mask constant for a UCharCategory.
#define U_GC_NL_MASK   U_MASK(U_LETTER_NUMBER)
 Mask constant for a UCharCategory.
#define U_GC_NO_MASK   U_MASK(U_OTHER_NUMBER)
 Mask constant for a UCharCategory.
#define U_GC_ZS_MASK   U_MASK(U_SPACE_SEPARATOR)
 Mask constant for a UCharCategory.
#define U_GC_ZL_MASK   U_MASK(U_LINE_SEPARATOR)
 Mask constant for a UCharCategory.
#define U_GC_ZP_MASK   U_MASK(U_PARAGRAPH_SEPARATOR)
 Mask constant for a UCharCategory.
#define U_GC_CC_MASK   U_MASK(U_CONTROL_CHAR)
 Mask constant for a UCharCategory.
#define U_GC_CF_MASK   U_MASK(U_FORMAT_CHAR)
 Mask constant for a UCharCategory.
#define U_GC_CO_MASK   U_MASK(U_PRIVATE_USE_CHAR)
 Mask constant for a UCharCategory.
#define U_GC_CS_MASK   U_MASK(U_SURROGATE)
 Mask constant for a UCharCategory.
#define U_GC_PD_MASK   U_MASK(U_DASH_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_PS_MASK   U_MASK(U_START_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_PE_MASK   U_MASK(U_END_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_PC_MASK   U_MASK(U_CONNECTOR_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_PO_MASK   U_MASK(U_OTHER_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_SM_MASK   U_MASK(U_MATH_SYMBOL)
 Mask constant for a UCharCategory.
#define U_GC_SC_MASK   U_MASK(U_CURRENCY_SYMBOL)
 Mask constant for a UCharCategory.
#define U_GC_SK_MASK   U_MASK(U_MODIFIER_SYMBOL)
 Mask constant for a UCharCategory.
#define U_GC_SO_MASK   U_MASK(U_OTHER_SYMBOL)
 Mask constant for a UCharCategory.
#define U_GC_PI_MASK   U_MASK(U_INITIAL_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_PF_MASK   U_MASK(U_FINAL_PUNCTUATION)
 Mask constant for a UCharCategory.
#define U_GC_L_MASK   (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK)
 Mask constant for multiple UCharCategory bits (L Letters).
#define U_GC_LC_MASK   (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK)
 Mask constant for multiple UCharCategory bits (LC Cased Letters).
#define U_GC_M_MASK   (U_GC_MN_MASK|U_GC_ME_MASK|U_GC_MC_MASK)
 Mask constant for multiple UCharCategory bits (M Marks).
#define U_GC_N_MASK   (U_GC_ND_MASK|U_GC_NL_MASK|U_GC_NO_MASK)
 Mask constant for multiple UCharCategory bits (N Numbers).
#define U_GC_Z_MASK   (U_GC_ZS_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK)
 Mask constant for multiple UCharCategory bits (Z Separators).
#define U_GC_C_MASK   (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK)
 Mask constant for multiple UCharCategory bits (C Others).
#define U_GC_P_MASK
 Mask constant for multiple UCharCategory bits (P Punctuation).
#define U_GC_S_MASK   (U_GC_SM_MASK|U_GC_SC_MASK|U_GC_SK_MASK|U_GC_SO_MASK)
 Mask constant for multiple UCharCategory bits (S Symbols).
#define U_NO_NUMERIC_VALUE   ((double)-123456789.)
 Special value that is returned by u_getNumericValue when no numeric value is defined for a code point.
#define U_GET_GC_MASK(c)   U_MASK(u_charType(c))
 Get a single-bit bit set for the general category of a character.
#define U_FOLD_CASE_DEFAULT   0
 Option value for case folding: use default mappings defined in CaseFolding.txt.
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I   1
 Option value for case folding:.

Typedefs

typedef UBool U_CALLCONV UCharEnumTypeRange (const void *context, UChar32 start, UChar32 limit, UCharCategory type)
 Callback from u_enumCharTypes(), is called for each contiguous range of code points c (where start<=c<limit) with the same Unicode general category ("character type").
typedef UBool U_CALLCONV UEnumCharNamesFn (void *context, UChar32 code, UCharNameChoice nameChoice, const char *name, int32_t length)
 Type of a callback function for u_enumCharNames() that gets called for each Unicode character with the code point value and the character name.

Enumerations

enum  UProperty {
  UCHAR_ALPHABETIC = 0, UCHAR_BINARY_START = UCHAR_ALPHABETIC, UCHAR_ASCII_HEX_DIGIT = 1, UCHAR_BIDI_CONTROL = 2,
  UCHAR_BIDI_MIRRORED = 3, UCHAR_DASH = 4, UCHAR_DEFAULT_IGNORABLE_CODE_POINT = 5, UCHAR_DEPRECATED = 6,
  UCHAR_DIACRITIC = 7, UCHAR_EXTENDER = 8, UCHAR_FULL_COMPOSITION_EXCLUSION = 9, UCHAR_GRAPHEME_BASE = 10,
  UCHAR_GRAPHEME_EXTEND = 11, UCHAR_GRAPHEME_LINK = 12, UCHAR_HEX_DIGIT = 13, UCHAR_HYPHEN = 14,
  UCHAR_ID_CONTINUE = 15, UCHAR_ID_START = 16, UCHAR_IDEOGRAPHIC = 17, UCHAR_IDS_BINARY_OPERATOR = 18,
  UCHAR_IDS_TRINARY_OPERATOR = 19, UCHAR_JOIN_CONTROL = 20, UCHAR_LOGICAL_ORDER_EXCEPTION = 21, UCHAR_LOWERCASE = 22,
  UCHAR_MATH = 23, UCHAR_NONCHARACTER_CODE_POINT = 24, UCHAR_QUOTATION_MARK = 25, UCHAR_RADICAL = 26,
  UCHAR_SOFT_DOTTED = 27, UCHAR_TERMINAL_PUNCTUATION = 28, UCHAR_UNIFIED_IDEOGRAPH = 29, UCHAR_UPPERCASE = 30,
  UCHAR_WHITE_SPACE = 31, UCHAR_XID_CONTINUE = 32, UCHAR_XID_START = 33, UCHAR_CASE_SENSITIVE = 34,
  UCHAR_S_TERM = 35, UCHAR_VARIATION_SELECTOR = 36, UCHAR_NFD_INERT = 37, UCHAR_NFKD_INERT = 38,
  UCHAR_NFC_INERT = 39, UCHAR_NFKC_INERT = 40, UCHAR_SEGMENT_STARTER = 41, UCHAR_PATTERN_SYNTAX = 42,
  UCHAR_PATTERN_WHITE_SPACE = 43, UCHAR_POSIX_ALNUM = 44, UCHAR_POSIX_BLANK = 45, UCHAR_POSIX_GRAPH = 46,
  UCHAR_POSIX_PRINT = 47, UCHAR_POSIX_XDIGIT = 48, UCHAR_BINARY_LIMIT = 49, UCHAR_BIDI_CLASS = 0x1000,
  UCHAR_INT_START = UCHAR_BIDI_CLASS, UCHAR_BLOCK = 0x1001, UCHAR_CANONICAL_COMBINING_CLASS = 0x1002, UCHAR_DECOMPOSITION_TYPE = 0x1003,
  UCHAR_EAST_ASIAN_WIDTH = 0x1004, UCHAR_GENERAL_CATEGORY = 0x1005, UCHAR_JOINING_GROUP = 0x1006, UCHAR_JOINING_TYPE = 0x1007,
  UCHAR_LINE_BREAK = 0x1008, UCHAR_NUMERIC_TYPE = 0x1009, UCHAR_SCRIPT = 0x100A, UCHAR_HANGUL_SYLLABLE_TYPE = 0x100B,
  UCHAR_NFD_QUICK_CHECK = 0x100C, UCHAR_NFKD_QUICK_CHECK = 0x100D, UCHAR_NFC_QUICK_CHECK = 0x100E, UCHAR_NFKC_QUICK_CHECK = 0x100F,
  UCHAR_LEAD_CANONICAL_COMBINING_CLASS = 0x1010, UCHAR_TRAIL_CANONICAL_COMBINING_CLASS = 0x1011, UCHAR_GRAPHEME_CLUSTER_BREAK = 0x1012, UCHAR_SENTENCE_BREAK = 0x1013,
  UCHAR_WORD_BREAK = 0x1014, UCHAR_INT_LIMIT = 0x1015, UCHAR_GENERAL_CATEGORY_MASK = 0x2000, UCHAR_MASK_START = UCHAR_GENERAL_CATEGORY_MASK,
  UCHAR_MASK_LIMIT = 0x2001, UCHAR_NUMERIC_VALUE = 0x3000, UCHAR_DOUBLE_START = UCHAR_NUMERIC_VALUE, UCHAR_DOUBLE_LIMIT = 0x3001,
  UCHAR_AGE = 0x4000, UCHAR_STRING_START = UCHAR_AGE, UCHAR_BIDI_MIRRORING_GLYPH = 0x4001, UCHAR_CASE_FOLDING = 0x4002,
  UCHAR_ISO_COMMENT = 0x4003, UCHAR_LOWERCASE_MAPPING = 0x4004, UCHAR_NAME = 0x4005, UCHAR_SIMPLE_CASE_FOLDING = 0x4006,
  UCHAR_SIMPLE_LOWERCASE_MAPPING = 0x4007, UCHAR_SIMPLE_TITLECASE_MAPPING = 0x4008, UCHAR_SIMPLE_UPPERCASE_MAPPING = 0x4009, UCHAR_TITLECASE_MAPPING = 0x400A,
  UCHAR_UNICODE_1_NAME = 0x400B, UCHAR_UPPERCASE_MAPPING = 0x400C, UCHAR_STRING_LIMIT = 0x400D, UCHAR_INVALID_CODE = -1
}
 Selection constants for Unicode properties. More...
enum  UCharCategory {
  U_UNASSIGNED = 0, U_GENERAL_OTHER_TYPES = 0, U_UPPERCASE_LETTER = 1, U_LOWERCASE_LETTER = 2,
  U_TITLECASE_LETTER = 3, U_MODIFIER_LETTER = 4, U_OTHER_LETTER = 5, U_NON_SPACING_MARK = 6,
  U_ENCLOSING_MARK = 7, U_COMBINING_SPACING_MARK = 8, U_DECIMAL_DIGIT_NUMBER = 9, U_LETTER_NUMBER = 10,
  U_OTHER_NUMBER = 11, U_SPACE_SEPARATOR = 12, U_LINE_SEPARATOR = 13, U_PARAGRAPH_SEPARATOR = 14,
  U_CONTROL_CHAR = 15, U_FORMAT_CHAR = 16, U_PRIVATE_USE_CHAR = 17, U_SURROGATE = 18,
  U_DASH_PUNCTUATION = 19, U_START_PUNCTUATION = 20, U_END_PUNCTUATION = 21, U_CONNECTOR_PUNCTUATION = 22,
  U_OTHER_PUNCTUATION = 23, U_MATH_SYMBOL = 24, U_CURRENCY_SYMBOL = 25, U_MODIFIER_SYMBOL = 26,
  U_OTHER_SYMBOL = 27, U_INITIAL_PUNCTUATION = 28, U_FINAL_PUNCTUATION = 29, U_CHAR_CATEGORY_COUNT
}
 Data for enumerated Unicode general category types. More...
enum  UCharDirection {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}
 This specifies the language directional property of a character set. More...
enum  UBlockCode {
  UBLOCK_NO_BLOCK = 0, UBLOCK_BASIC_LATIN = 1, UBLOCK_LATIN_1_SUPPLEMENT = 2, UBLOCK_LATIN_EXTENDED_A = 3,
  UBLOCK_LATIN_EXTENDED_B = 4, UBLOCK_IPA_EXTENSIONS = 5, UBLOCK_SPACING_MODIFIER_LETTERS = 6, UBLOCK_COMBINING_DIACRITICAL_MARKS = 7,
  UBLOCK_GREEK = 8, UBLOCK_CYRILLIC = 9, UBLOCK_ARMENIAN = 10, UBLOCK_HEBREW = 11,
  UBLOCK_ARABIC = 12, UBLOCK_SYRIAC = 13, UBLOCK_THAANA = 14, UBLOCK_DEVANAGARI = 15,
  UBLOCK_BENGALI = 16, UBLOCK_GURMUKHI = 17, UBLOCK_GUJARATI = 18, UBLOCK_ORIYA = 19,
  UBLOCK_TAMIL = 20, UBLOCK_TELUGU = 21, UBLOCK_KANNADA = 22, UBLOCK_MALAYALAM = 23,
  UBLOCK_SINHALA = 24, UBLOCK_THAI = 25, UBLOCK_LAO = 26, UBLOCK_TIBETAN = 27,
  UBLOCK_MYANMAR = 28, UBLOCK_GEORGIAN = 29, UBLOCK_HANGUL_JAMO = 30, UBLOCK_ETHIOPIC = 31,
  UBLOCK_CHEROKEE = 32, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33, UBLOCK_OGHAM = 34, UBLOCK_RUNIC = 35,
  UBLOCK_KHMER = 36, UBLOCK_MONGOLIAN = 37, UBLOCK_LATIN_EXTENDED_ADDITIONAL = 38, UBLOCK_GREEK_EXTENDED = 39,
  UBLOCK_GENERAL_PUNCTUATION = 40, UBLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS = 41, UBLOCK_CURRENCY_SYMBOLS = 42, UBLOCK_COMBINING_MARKS_FOR_SYMBOLS = 43,
  UBLOCK_LETTERLIKE_SYMBOLS = 44, UBLOCK_NUMBER_FORMS = 45, UBLOCK_ARROWS = 46, UBLOCK_MATHEMATICAL_OPERATORS = 47,
  UBLOCK_MISCELLANEOUS_TECHNICAL = 48, UBLOCK_CONTROL_PICTURES = 49, UBLOCK_OPTICAL_CHARACTER_RECOGNITION = 50, UBLOCK_ENCLOSED_ALPHANUMERICS = 51,
  UBLOCK_BOX_DRAWING = 52, UBLOCK_BLOCK_ELEMENTS = 53, UBLOCK_GEOMETRIC_SHAPES = 54, UBLOCK_MISCELLANEOUS_SYMBOLS = 55,
  UBLOCK_DINGBATS = 56, UBLOCK_BRAILLE_PATTERNS = 57, UBLOCK_CJK_RADICALS_SUPPLEMENT = 58, UBLOCK_KANGXI_RADICALS = 59,
  UBLOCK_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60, UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION = 61, UBLOCK_HIRAGANA = 62, UBLOCK_KATAKANA = 63,
  UBLOCK_BOPOMOFO = 64, UBLOCK_HANGUL_COMPATIBILITY_JAMO = 65, UBLOCK_KANBUN = 66, UBLOCK_BOPOMOFO_EXTENDED = 67,
  UBLOCK_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68, UBLOCK_CJK_COMPATIBILITY = 69, UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70, UBLOCK_CJK_UNIFIED_IDEOGRAPHS = 71,
  UBLOCK_YI_SYLLABLES = 72, UBLOCK_YI_RADICALS = 73, UBLOCK_HANGUL_SYLLABLES = 74, UBLOCK_HIGH_SURROGATES = 75,
  UBLOCK_HIGH_PRIVATE_USE_SURROGATES = 76, UBLOCK_LOW_SURROGATES = 77, UBLOCK_PRIVATE_USE = 78, UBLOCK_PRIVATE_USE_AREA = UBLOCK_PRIVATE_USE,
  UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS = 79, UBLOCK_ALPHABETIC_PRESENTATION_FORMS = 80, UBLOCK_ARABIC_PRESENTATION_FORMS_A = 81, UBLOCK_COMBINING_HALF_MARKS = 82,
  UBLOCK_CJK_COMPATIBILITY_FORMS = 83, UBLOCK_SMALL_FORM_VARIANTS = 84, UBLOCK_ARABIC_PRESENTATION_FORMS_B = 85, UBLOCK_SPECIALS = 86,
  UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS = 87, UBLOCK_OLD_ITALIC = 88, UBLOCK_GOTHIC = 89, UBLOCK_DESERET = 90,
  UBLOCK_BYZANTINE_MUSICAL_SYMBOLS = 91, UBLOCK_MUSICAL_SYMBOLS = 92, UBLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93, UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94,
  UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95, UBLOCK_TAGS = 96, UBLOCK_CYRILLIC_SUPPLEMENTARY = 97, UBLOCK_CYRILLIC_SUPPLEMENT = UBLOCK_CYRILLIC_SUPPLEMENTARY,
  UBLOCK_TAGALOG = 98, UBLOCK_HANUNOO = 99, UBLOCK_BUHID = 100, UBLOCK_TAGBANWA = 101,
  UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102, UBLOCK_SUPPLEMENTAL_ARROWS_A = 103, UBLOCK_SUPPLEMENTAL_ARROWS_B = 104, UBLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105,
  UBLOCK_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106, UBLOCK_KATAKANA_PHONETIC_EXTENSIONS = 107, UBLOCK_VARIATION_SELECTORS = 108, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109,
  UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110, UBLOCK_LIMBU = 111, UBLOCK_TAI_LE = 112, UBLOCK_KHMER_SYMBOLS = 113,
  UBLOCK_PHONETIC_EXTENSIONS = 114, UBLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115, UBLOCK_YIJING_HEXAGRAM_SYMBOLS = 116, UBLOCK_LINEAR_B_SYLLABARY = 117,
  UBLOCK_LINEAR_B_IDEOGRAMS = 118, UBLOCK_AEGEAN_NUMBERS = 119, UBLOCK_UGARITIC = 120, UBLOCK_SHAVIAN = 121,
  UBLOCK_OSMANYA = 122, UBLOCK_CYPRIOT_SYLLABARY = 123, UBLOCK_TAI_XUAN_JING_SYMBOLS = 124, UBLOCK_VARIATION_SELECTORS_SUPPLEMENT = 125,
  UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION = 126, UBLOCK_ANCIENT_GREEK_NUMBERS = 127, UBLOCK_ARABIC_SUPPLEMENT = 128, UBLOCK_BUGINESE = 129,
  UBLOCK_CJK_STROKES = 130, UBLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131, UBLOCK_COPTIC = 132, UBLOCK_ETHIOPIC_EXTENDED = 133,
  UBLOCK_ETHIOPIC_SUPPLEMENT = 134, UBLOCK_GEORGIAN_SUPPLEMENT = 135, UBLOCK_GLAGOLITIC = 136, UBLOCK_KHAROSHTHI = 137,
  UBLOCK_MODIFIER_TONE_LETTERS = 138, UBLOCK_NEW_TAI_LUE = 139, UBLOCK_OLD_PERSIAN = 140, UBLOCK_PHONETIC_EXTENSIONS_SUPPLEMENT = 141,
  UBLOCK_SUPPLEMENTAL_PUNCTUATION = 142, UBLOCK_SYLOTI_NAGRI = 143, UBLOCK_TIFINAGH = 144, UBLOCK_VERTICAL_FORMS = 145,
  UBLOCK_NKO = 146, UBLOCK_BALINESE = 147, UBLOCK_LATIN_EXTENDED_C = 148, UBLOCK_LATIN_EXTENDED_D = 149,
  UBLOCK_PHAGS_PA = 150, UBLOCK_PHOENICIAN = 151, UBLOCK_CUNEIFORM = 152, UBLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153,
  UBLOCK_COUNTING_ROD_NUMERALS = 154, UBLOCK_SUNDANESE = 155, UBLOCK_LEPCHA = 156, UBLOCK_OL_CHIKI = 157,
  UBLOCK_CYRILLIC_EXTENDED_A = 158, UBLOCK_VAI = 159, UBLOCK_CYRILLIC_EXTENDED_B = 160, UBLOCK_SAURASHTRA = 161,
  UBLOCK_KAYAH_LI = 162, UBLOCK_REJANG = 163, UBLOCK_CHAM = 164, UBLOCK_ANCIENT_SYMBOLS = 165,
  UBLOCK_PHAISTOS_DISC = 166, UBLOCK_LYCIAN = 167, UBLOCK_CARIAN = 168, UBLOCK_LYDIAN = 169,
  UBLOCK_MAHJONG_TILES = 170, UBLOCK_DOMINO_TILES = 171, UBLOCK_COUNT = 172, UBLOCK_INVALID_CODE = -1
}
 Constants for Unicode blocks, see the Unicode Data file Blocks.txt. More...
enum  UEastAsianWidth {
  U_EA_NEUTRAL, U_EA_AMBIGUOUS, U_EA_HALFWIDTH, U_EA_FULLWIDTH,
  U_EA_NARROW, U_EA_WIDE, U_EA_COUNT
}
 East Asian Width constants. More...
enum  UCharNameChoice { U_UNICODE_CHAR_NAME, U_UNICODE_10_CHAR_NAME, U_EXTENDED_CHAR_NAME, U_CHAR_NAME_CHOICE_COUNT }
 Selector constants for u_charName(). More...
enum  UPropertyNameChoice { U_SHORT_PROPERTY_NAME, U_LONG_PROPERTY_NAME, U_PROPERTY_NAME_CHOICE_COUNT }
 Selector constants for u_getPropertyName() and u_getPropertyValueName(). More...
enum  UDecompositionType {
  U_DT_NONE, U_DT_CANONICAL, U_DT_COMPAT, U_DT_CIRCLE,
  U_DT_FINAL, U_DT_FONT, U_DT_FRACTION, U_DT_INITIAL,
  U_DT_ISOLATED, U_DT_MEDIAL, U_DT_NARROW, U_DT_NOBREAK,
  U_DT_SMALL, U_DT_SQUARE, U_DT_SUB, U_DT_SUPER,
  U_DT_VERTICAL, U_DT_WIDE, U_DT_COUNT
}
 Decomposition Type constants. More...
enum  UJoiningType {
  U_JT_NON_JOINING, U_JT_JOIN_CAUSING, U_JT_DUAL_JOINING, U_JT_LEFT_JOINING,
  U_JT_RIGHT_JOINING, U_JT_TRANSPARENT, U_JT_COUNT
}
 Joining Type constants. More...
enum  UJoiningGroup {
  U_JG_NO_JOINING_GROUP, U_JG_AIN, U_JG_ALAPH, U_JG_ALEF,
  U_JG_BEH, U_JG_BETH, U_JG_DAL, U_JG_DALATH_RISH,
  U_JG_E, U_JG_FEH, U_JG_FINAL_SEMKATH, U_JG_GAF,
  U_JG_GAMAL, U_JG_HAH, U_JG_HAMZA_ON_HEH_GOAL, U_JG_HE,
  U_JG_HEH, U_JG_HEH_GOAL, U_JG_HETH, U_JG_KAF,
  U_JG_KAPH, U_JG_KNOTTED_HEH, U_JG_LAM, U_JG_LAMADH,
  U_JG_MEEM, U_JG_MIM, U_JG_NOON, U_JG_NUN,
  U_JG_PE, U_JG_QAF, U_JG_QAPH, U_JG_REH,
  U_JG_REVERSED_PE, U_JG_SAD, U_JG_SADHE, U_JG_SEEN,
  U_JG_SEMKATH, U_JG_SHIN, U_JG_SWASH_KAF, U_JG_SYRIAC_WAW,
  U_JG_TAH, U_JG_TAW, U_JG_TEH_MARBUTA, U_JG_TETH,
  U_JG_WAW, U_JG_YEH, U_JG_YEH_BARREE, U_JG_YEH_WITH_TAIL,
  U_JG_YUDH, U_JG_YUDH_HE, U_JG_ZAIN, U_JG_FE,
  U_JG_KHAPH, U_JG_ZHAIN, U_JG_BURUSHASKI_YEH_BARREE, U_JG_COUNT
}
 Joining Group constants. More...
enum  UGraphemeClusterBreak {
  U_GCB_OTHER = 0, U_GCB_CONTROL = 1, U_GCB_CR = 2, U_GCB_EXTEND = 3,
  U_GCB_L = 4, U_GCB_LF = 5, U_GCB_LV = 6, U_GCB_LVT = 7,
  U_GCB_T = 8, U_GCB_V = 9, U_GCB_SPACING_MARK = 10, U_GCB_PREPEND = 11,
  U_GCB_COUNT = 12
}
 Grapheme Cluster Break constants. More...
enum  UWordBreakValues {
  U_WB_OTHER = 0, U_WB_ALETTER = 1, U_WB_FORMAT = 2, U_WB_KATAKANA = 3,
  U_WB_MIDLETTER = 4, U_WB_MIDNUM = 5, U_WB_NUMERIC = 6, U_WB_EXTENDNUMLET = 7,
  U_WB_CR = 8, U_WB_EXTEND = 9, U_WB_LF = 10, U_WB_MIDNUMLET = 11,
  U_WB_NEWLINE = 12, U_WB_COUNT = 13
}
 Word Break constants. More...
enum  USentenceBreak {
  U_SB_OTHER = 0, U_SB_ATERM = 1, U_SB_CLOSE = 2, U_SB_FORMAT = 3,
  U_SB_LOWER = 4, U_SB_NUMERIC = 5, U_SB_OLETTER = 6, U_SB_SEP = 7,
  U_SB_SP = 8, U_SB_STERM = 9, U_SB_UPPER = 10, U_SB_CR = 11,
  U_SB_EXTEND = 12, U_SB_LF = 13, U_SB_SCONTINUE = 14, U_SB_COUNT = 15
}
 Sentence Break constants. More...
enum  ULineBreak {
  U_LB_UNKNOWN = 0, U_LB_AMBIGUOUS = 1, U_LB_ALPHABETIC = 2, U_LB_BREAK_BOTH = 3,
  U_LB_BREAK_AFTER = 4, U_LB_BREAK_BEFORE = 5, U_LB_MANDATORY_BREAK = 6, U_LB_CONTINGENT_BREAK = 7,
  U_LB_CLOSE_PUNCTUATION = 8, U_LB_COMBINING_MARK = 9, U_LB_CARRIAGE_RETURN = 10, U_LB_EXCLAMATION = 11,
  U_LB_GLUE = 12, U_LB_HYPHEN = 13, U_LB_IDEOGRAPHIC = 14, U_LB_INSEPERABLE = 15,
  U_LB_INSEPARABLE = U_LB_INSEPERABLE, U_LB_INFIX_NUMERIC = 16, U_LB_LINE_FEED = 17, U_LB_NONSTARTER = 18,
  U_LB_NUMERIC = 19, U_LB_OPEN_PUNCTUATION = 20, U_LB_POSTFIX_NUMERIC = 21, U_LB_PREFIX_NUMERIC = 22,
  U_LB_QUOTATION = 23, U_LB_COMPLEX_CONTEXT = 24, U_LB_SURROGATE = 25, U_LB_SPACE = 26,
  U_LB_BREAK_SYMBOLS = 27, U_LB_ZWSPACE = 28, U_LB_NEXT_LINE = 29, U_LB_WORD_JOINER = 30,
  U_LB_H2 = 31, U_LB_H3 = 32, U_LB_JL = 33, U_LB_JT = 34,
  U_LB_JV = 35, U_LB_COUNT = 36
}
 Line Break constants. More...
enum  UNumericType {
  U_NT_NONE, U_NT_DECIMAL, U_NT_DIGIT, U_NT_NUMERIC,
  U_NT_COUNT
}
 Numeric Type constants. More...
enum  UHangulSyllableType {
  U_HST_NOT_APPLICABLE, U_HST_LEADING_JAMO, U_HST_VOWEL_JAMO, U_HST_TRAILING_JAMO,
  U_HST_LV_SYLLABLE, U_HST_LVT_SYLLABLE, U_HST_COUNT
}
 Hangul Syllable Type constants. More...

Functions

UBool u_hasBinaryProperty (UChar32 c, UProperty which)
 Check a binary Unicode property for a code point.
UBool u_isUAlphabetic (UChar32 c)
 Check if a code point has the Alphabetic Unicode property.
UBool u_isULowercase (UChar32 c)
 Check if a code point has the Lowercase Unicode property.
UBool u_isUUppercase (UChar32 c)
 Check if a code point has the Uppercase Unicode property.
UBool u_isUWhiteSpace (UChar32 c)
 Check if a code point has the White_Space Unicode property.
int32_t u_getIntPropertyValue (UChar32 c, UProperty which)
 Get the property value for an enumerated or integer Unicode property for a code point.
int32_t u_getIntPropertyMinValue (UProperty which)
 Get the minimum value for an enumerated/integer/binary Unicode property.
int32_t u_getIntPropertyMaxValue (UProperty which)
 Get the maximum value for an enumerated/integer/binary Unicode property.
double u_getNumericValue (UChar32 c)
 Get the numeric value for a Unicode code point as defined in the Unicode Character Database.
UBool u_islower (UChar32 c)
 Determines whether the specified code point has the general category "Ll" (lowercase letter).
UBool u_isupper (UChar32 c)
 Determines whether the specified code point has the general category "Lu" (uppercase letter).
UBool u_istitle (UChar32 c)
 Determines whether the specified code point is a titlecase letter.
UBool u_isdigit (UChar32 c)
 Determines whether the specified code point is a digit character according to Java.
UBool u_isalpha (UChar32 c)
 Determines whether the specified code point is a letter character.
UBool u_isalnum (UChar32 c)
 Determines whether the specified code point is an alphanumeric character (letter or digit) according to Java.
UBool u_isxdigit (UChar32 c)
 Determines whether the specified code point is a hexadecimal digit.
UBool u_ispunct (UChar32 c)
 Determines whether the specified code point is a punctuation character.
UBool u_isgraph (UChar32 c)
 Determines whether the specified code point is a "graphic" character (printable, excluding spaces).
UBool u_isblank (UChar32 c)
 Determines whether the specified code point is a "blank" or "horizontal space", a character that visibly separates words on a line.
UBool u_isdefined (UChar32 c)
 Determines whether the specified code point is "defined", which usually means that it is assigned a character.
UBool u_isspace (UChar32 c)
 Determines if the specified character is a space character or not.
UBool u_isJavaSpaceChar (UChar32 c)
 Determine if the specified code point is a space character according to Java.
UBool u_isWhitespace (UChar32 c)
 Determines if the specified code point is a whitespace character according to Java/ICU.
UBool u_iscntrl (UChar32 c)
 Determines whether the specified code point is a control character (as defined by this function).
UBool u_isISOControl (UChar32 c)
 Determines whether the specified code point is an ISO control code.
UBool u_isprint (UChar32 c)
 Determines whether the specified code point is a printable character.
UBool u_isbase (UChar32 c)
 Determines whether the specified code point is a base character.
UCharDirection u_charDirection (UChar32 c)
 Returns the bidirectional category value for the code point, which is used in the Unicode bidirectional algorithm (UAX #9 http://www.unicode.org/reports/tr9/).
UBool u_isMirrored (UChar32 c)
 Determines whether the code point has the Bidi_Mirrored property.
UChar32 u_charMirror (UChar32 c)
 Maps the specified character to a "mirror-image" character.
int8_t u_charType (UChar32 c)
 Returns the general category value for the code point.
void u_enumCharTypes (UCharEnumTypeRange *enumRange, const void *context)
 Enumerate efficiently all code points with their Unicode general categories.
uint8_t u_getCombiningClass (UChar32 c)
 Returns the combining class of the code point as specified in UnicodeData.txt.
int32_t u_charDigitValue (UChar32 c)
 Returns the decimal digit value of a decimal digit character.
UBlockCode ublock_getCode (UChar32 c)
 Returns the Unicode allocation block that contains the character.
int32_t u_charName (UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode)
 Retrieve the name of a Unicode character.
int32_t u_getISOComment (UChar32 c, char *dest, int32_t destCapacity, UErrorCode *pErrorCode)
 Get the ISO 10646 comment for a character.
UChar32 u_charFromName (UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode)
 Find a Unicode character by its name and return its code point value.
void u_enumCharNames (UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode)
 Enumerate all assigned Unicode characters between the start and limit code points (start inclusive, limit exclusive) and call a function for each, passing the code point value and the character name.
const char * u_getPropertyName (UProperty property, UPropertyNameChoice nameChoice)
 Return the Unicode name for a given property, as given in the Unicode database file PropertyAliases.txt.
UProperty u_getPropertyEnum (const char *alias)
 Return the UProperty enum for a given property name, as specified in the Unicode database file PropertyAliases.txt.
const char * u_getPropertyValueName (UProperty property, int32_t value, UPropertyNameChoice nameChoice)
 Return the Unicode name for a given property value, as given in the Unicode database file PropertyValueAliases.txt.
int32_t u_getPropertyValueEnum (UProperty property, const char *alias)
 Return the property value integer for a given value name, as specified in the Unicode database file PropertyValueAliases.txt.
UBool u_isIDStart (UChar32 c)
 Determines if the specified character is permissible as the first character in an identifier according to Unicode (The Unicode Standard, Version 3.0, chapter 5.16 Identifiers).
UBool u_isIDPart (UChar32 c)
 Determines if the specified character is permissible in an identifier according to Java.
UBool u_isIDIgnorable (UChar32 c)
 Determines if the specified character should be regarded as an ignorable character in an identifier, according to Java.
UBool u_isJavaIDStart (UChar32 c)
 Determines if the specified character is permissible as the first character in a Java identifier.
UBool u_isJavaIDPart (UChar32 c)
 Determines if the specified character is permissible in a Java identifier.
UChar32 u_tolower (UChar32 c)
 The given character is mapped to its lowercase equivalent according to UnicodeData.txt; if the character has no lowercase equivalent, the character itself is returned.
UChar32 u_toupper (UChar32 c)
 The given character is mapped to its uppercase equivalent according to UnicodeData.txt; if the character has no uppercase equivalent, the character itself is returned.
UChar32 u_totitle (UChar32 c)
 The given character is mapped to its titlecase equivalent according to UnicodeData.txt; if none is defined, the character itself is returned.
UChar32 u_foldCase (UChar32 c, uint32_t options)
 The given character is mapped to its case folding equivalent according to UnicodeData.txt and CaseFolding.txt; if the character has no case folding equivalent, the character itself is returned.
int32_t u_digit (UChar32 ch, int8_t radix)
 Returns the decimal digit value of the code point in the specified radix.
UChar32 u_forDigit (int32_t digit, int8_t radix)
 Determines the character representation for a specific digit in the specified radix.
void u_charAge (UChar32 c, UVersionInfo versionArray)
 Get the "age" of the code point.
void u_getUnicodeVersion (UVersionInfo versionArray)
 Gets the Unicode version information.
int32_t u_getFC_NFKC_Closure (UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode)
 Get the FC_NFKC_Closure property string for a character.


Detailed Description

C API: Unicode Properties.

This C API provides low-level access to the Unicode Character Database. In addition to raw property values, some convenience functions calculate derived properties, for example for Java-style programming.

Unicode assigns each code point (not just assigned character) values for many properties. Most of them are simple boolean flags, or constants from a small enumerated list. For some properties, values are strings or other relatively more complex types.

For more information see "About the Unicode Character Database" (http://www.unicode.org/ucd/) and the ICU User Guide chapter on Properties (http://icu-project.org/userguide/properties.html).

Many functions are designed to match java.lang.Character functions. See the individual function documentation, and see the JDK 1.4 java.lang.Character documentation at http://java.sun.com/j2se/1.4/docs/api/java/lang/Character.html

There are also functions that provide easy migration from C/POSIX functions like isblank(). Their use is generally discouraged because the C/POSIX standards do not define their semantics beyond the ASCII range, which means that different implementations exhibit very different behavior. Instead, Unicode properties should be used directly.

There are also only a few, broad C/POSIX character classes, and they tend to be used for conflicting purposes. For example, the "isalpha()" class is sometimes used to determine word boundaries, while a more sophisticated approach would at least distinguish initial letters from continuation characters (the latter including combining marks). (In ICU, BreakIterator is the most sophisticated API for word boundaries.) Another example: There is no "istitle()" class for titlecase characters.

ICU 3.4 and later provides API access for all twelve C/POSIX character classes. ICU implements them according to the Standard Recommendations in Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/#Compatibility_Properties).

API access for C/POSIX character classes is as follows:

Note: Some of the u_isxyz() functions in uchar.h predate, and do not match, the Standard Recommendations in UTS #18. Instead, they match Java functions according to their API documentation.

The C/POSIX character classes are also available in UnicodeSet patterns, using patterns like [:graph:] or \p{graph}.

Note: There are several ICU whitespace functions. Comparison:

Definition in file uchar.h.


Define Documentation

#define U_FOLD_CASE_DEFAULT   0

Option value for case folding: use default mappings defined in CaseFolding.txt.

Stable:
ICU 2.0

Definition at line 2887 of file uchar.h.

#define U_FOLD_CASE_EXCLUDE_SPECIAL_I   1

Option value for case folding:.

Use the modified set of mappings provided in CaseFolding.txt to handle dotted I and dotless i appropriately for Turkic languages (tr, az).

Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that are to be included for default mappings and excluded for the Turkic-specific mappings.

Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that are to be excluded for default mappings and included for the Turkic-specific mappings.

Stable:
ICU 2.0

Definition at line 2905 of file uchar.h.

#define U_GC_C_MASK   (U_GC_CN_MASK|U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CO_MASK|U_GC_CS_MASK)

Mask constant for multiple UCharCategory bits (C Others).

Stable:
ICU 2.1

Definition at line 749 of file uchar.h.

#define U_GC_CC_MASK   U_MASK(U_CONTROL_CHAR)

Mask constant for a UCharCategory.

Stable:
ICU 2.1

Definition at line 697 of file uchar.h.

#define U_GC_CF_MASK   U_MASK(U_FORMAT_CHAR)

Mask constant for a UCharCategory.

Stable:
ICU 2.1

Definition at line 699 of file uchar.h.

#define U_GC_CN_MASK   U_MASK(U_GENERAL_OTHER_TYPES)

U_GC_XX_MASK constants are bit flags corresponding to Unicode general category values.

For each category, the nth bit is set if the numeric value of the corresponding UCharCategory constant is n.

There are also some U_GC_Y_MASK constants for groups of general categories like L for all letter categories.

See also:
u_charType

U_GET_GC_MASK

UCharCategory

Stable:
ICU 2.1

Definition at line 662 of file uchar.h.

#define U_GC_CO_MASK   U_MASK(U_PRIVATE_USE_CHAR)

Mask constant for a UCharCategory.

Stable:
ICU 2.1

Definition at line 701 of file uchar.h.

#define U_GC_CS_MASK   U_MASK(U_SURROGATE)

Mask constant for a UCharCategory.

Stable:
ICU 2.1

Definition at line 703 of file uchar.h.

#define U_GC_L_MASK   (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK|U_GC_LM_MASK|U_GC_LO_MASK)

Mask constant for multiple UCharCategory bits (L Letters).

Stable:
ICU 2.1

Definition at line 732 of file uchar.h.

#define U_GC_LC_MASK   (U_GC_LU_MASK|U_GC_LL_MASK|U_GC_LT_MASK)

Mask constant for multiple UCharCategory bits (LC Cased Letters).

Stable:
ICU 2.1

Definition at line 736 of file uchar.h.

#define U_GC_LL_MASK   U_MASK(U_LOWERCASE_LETTER)

Mask constant for a UCharCategory.

Stable:
ICU 2.1

Definition at line 667 of file uchar.h.

#define U_GC_LM_MASK   U_MASK(U_MODIFIER_LETTER)

Mask constant for a UCharCategory.

Stable:
ICU 2.1

Definition at line 671 of file uchar.h.

#define U_GC_LO_MASK   U_MASK(U_OTHER_LETTER)

Mask constant for a UCharCategory.