#include "unicode/ucnv_err.h"
#include "unicode/uenum.h"
Go to the source code of this file.
Defines | |
| #define | UCNV_MAX_CONVERTER_NAME_LENGTH 60 |
| Maximum length of a converter name including the terminating NULL. | |
| #define | UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) |
| Maximum length of a converter name including path and terminating NULL. | |
| #define | UCNV_SI 0x0F |
| Shift in for EBCDIC_STATEFUL and iso2022 states. | |
| #define | UCNV_SO 0x0E |
| Shift out for EBCDIC_STATEFUL and iso2022 states. | |
| #define | UCNV_OPTION_SEP_CHAR ',' |
| Character that separates converter names from options and options from each other. | |
| #define | UCNV_OPTION_SEP_STRING "," |
| String version of UCNV_OPTION_SEP_CHAR. | |
| #define | UCNV_VALUE_SEP_CHAR '=' |
| Character that separates a converter option from its value. | |
| #define | UCNV_VALUE_SEP_STRING "=" |
| String version of UCNV_VALUE_SEP_CHAR. | |
| #define | UCNV_LOCALE_OPTION_STRING ",locale=" |
| Converter option for specifying a locale. | |
| #define | UCNV_VERSION_OPTION_STRING ",version=" |
| Converter option for specifying a version selector (0..9) for some converters. | |
| #define | UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" |
| Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages. | |
| #define | U_CNV_SAFECLONE_BUFFERSIZE 1024 |
| Definition of a buffer size that is designed to be large enough for converters to be cloned with ucnv_safeClone(). | |
| #define | UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) (((int32_t)(length)+10)*(int32_t)(maxCharSize)) |
| Calculates the size of a buffer for conversion from Unicode to a charset. | |
Typedefs | |
| typedef struct USet | USet |
| typedef UConverterToUnicodeArgs * | args |
|
typedef UConverterToUnicodeArgs const char * | codeUnits |
|
typedef UConverterToUnicodeArgs const char int32_t | length |
|
typedef UConverterToUnicodeArgs const char int32_t UConverterCallbackReason | reason |
|
typedef UConverterToUnicodeArgs const char int32_t UConverterCallbackReason UErrorCode * | pErrorCode |
|
typedef UConverterFromUnicodeArgs const UChar int32_t UChar32 | codePoint |
Enumerations | |
| enum | UConverterType { UCNV_UNSUPPORTED_CONVERTER = -1, UCNV_SBCS = 0, UCNV_DBCS = 1, UCNV_MBCS = 2, UCNV_LATIN_1 = 3, UCNV_UTF8 = 4, UCNV_UTF16_BigEndian = 5, UCNV_UTF16_LittleEndian = 6, UCNV_UTF32_BigEndian = 7, UCNV_UTF32_LittleEndian = 8, UCNV_EBCDIC_STATEFUL = 9, UCNV_ISO_2022 = 10, UCNV_LMBCS_1 = 11, UCNV_LMBCS_2, UCNV_LMBCS_3, UCNV_LMBCS_4, UCNV_LMBCS_5, UCNV_LMBCS_6, UCNV_LMBCS_8, UCNV_LMBCS_11, UCNV_LMBCS_16, UCNV_LMBCS_17, UCNV_LMBCS_18, UCNV_LMBCS_19, UCNV_LMBCS_LAST = UCNV_LMBCS_19, UCNV_HZ, UCNV_SCSU, UCNV_ISCII, UCNV_US_ASCII, UCNV_UTF7, UCNV_BOCU1, UCNV_UTF16, UCNV_UTF32, UCNV_CESU8, UCNV_IMAP_MAILBOX, UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES } |
| Enum for specifying basic types of converters. More... | |
| enum | UConverterPlatform { UCNV_UNKNOWN = -1, UCNV_IBM = 0 } |
| Enum for specifying which platform a converter ID refers to. More... | |
| enum | UConverterUnicodeSet { UCNV_ROUNDTRIP_SET, UCNV_ROUNDTRIP_AND_FALLBACK_SET, UCNV_SET_COUNT } |
| Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet(). More... | |
Functions | |
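| typedef | void (U_EXPORT2 *UConverterToUCallback)(const void *context, UConverterToUnicodeArgs *args, const char *codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode *pErrorCode) |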
| Function pointer for error callback in the codepage to unicode direction. | |
| int | ucnv_compareNames (const char *name1, const char *name2) |
| Do a fuzzy compare of two converter/alias names. | |
| UConverter * | ucnv_open (const char *converterName, UErrorCode *err) |
| Creates a UConverter object with the name of a coded character set specified as a C string. | |
| UConverter * | ucnv_openU (const UChar *name, UErrorCode *err) |
| Creates a Unicode converter with the names specified as unicode string. | |
| UConverter * | ucnv_openCCSID (int32_t codepage, UConverterPlatform platform, UErrorCode *err) |
| Creates a UConverter object from a CCSID number and platform pair. | |
| UConverter * | ucnv_openPackage (const char *packageName, const char *converterName, UErrorCode *err) |
| UConverter * | ucnv_safeClone (const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) |
| Thread safe converter cloning operation. | |
| void | ucnv_close (UConverter *converter) |
| Deletes the unicode converter and releases resources associated with just this instance. | |
| void | ucnv_getSubstChars (const UConverter *converter, char *subChars, int8_t *len, UErrorCode *err) |
| Fills in the output parameter, subChars, with the substitution characters as multiple bytes. | |
| void | ucnv_setSubstChars (UConverter *converter, const char *subChars, int8_t len, UErrorCode *err) |
| Sets the substitution chars when converting from unicode to a codepage. | |
| void | ucnv_setSubstString (UConverter *cnv, const UChar *s, int32_t length, UErrorCode *err) |
| Set a substitution string for converting from Unicode to a charset. | |
| void | ucnv_getInvalidChars (const UConverter *converter, char *errBytes, int8_t *len, UErrorCode *err) |
| Fills in the output parameter, errBytes, with the error characters from the last failing conversion. | |
| void | ucnv_getInvalidUChars (const UConverter *converter, UChar *errUChars, int8_t *len, UErrorCode *err) |
| Fills in the output parameter, errUChars, with the error characters from the last failing conversion. | |
| void | ucnv_reset (UConverter *converter) |
| Resets the state of a converter to the default state. | |
| void | ucnv_resetToUnicode (UConverter *converter) |
| Resets the to-Unicode part of a converter state to the default state. | |
| void | ucnv_resetFromUnicode (UConverter *converter) |
| Resets the from-Unicode part of a converter state to the default state. | |
| int8_t | ucnv_getMaxCharSize (const UConverter *converter) |
| Returns the maximum number of bytes that are output per UChar in conversion from Unicode using this converter. | |
| int8_t | ucnv_getMinCharSize (const UConverter *converter) |
| Returns the minimum byte length for characters in this codepage. | |
| int32_t | ucnv_getDisplayName (const UConverter *converter, const char *displayLocale, UChar *displayName, int32_t displayNameCapacity, UErrorCode *err) |
| Returns the display name of the converter passed in based on the Locale passed in. | |
| const char * | ucnv_getName (const UConverter *converter, UErrorCode *err) |
| Gets the internal, canonical name of the converter (zero-terminated). | |
| int32_t | ucnv_getCCSID (const UConverter *converter, UErrorCode *err) |
| Gets a codepage number associated with the converter. | |
| UConverterPlatform | ucnv_getPlatform (const UConverter *converter, UErrorCode *err) |
| Gets a codepage platform associated with the converter. | |
| UConverterType | ucnv_getType (const UConverter *converter) |
| Gets the type of the converter, e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022, EBCDIC_STATEFUL, LATIN_1. | |
| void | ucnv_getStarters (const UConverter *converter, UBool starters[256], UErrorCode *err) |
| Gets the "starter" (lead) bytes for converters of type MBCS. | |
| void | ucnv_getUnicodeSet (const UConverter *cnv, USet *setFillIn, UConverterUnicodeSet whichSet, UErrorCode *pErrorCode) |
| Returns the set of Unicode code points that can be converted by an ICU converter. | |
| void | ucnv_getToUCallBack (const UConverter *converter, UConverterToUCallback *action, const void **context) |
| Gets the current callback function used by the converter when an illegal or invalid codepage sequence is found. | |
| void | ucnv_getFromUCallBack (const UConverter *converter, UConverterFromUCallback *action, const void **context) |
| Gets the current callback function used by the converter when illegal or invalid Unicode sequence is found. | |
| void | ucnv_setToUCallBack (UConverter *converter, UConverterToUCallback newAction, const void *newContext, UConverterToUCallback *oldAction, const void **oldContext, UErrorCode *err) |
| Changes the callback function used by the converter when an illegal or invalid sequence is found. | |
| void | ucnv_setFromUCallBack (UConverter *converter, UConverterFromUCallback newAction, const void *newContext, UConverterFromUCallback *oldAction, const void **oldContext, UErrorCode *err) |
| Changes the current callback function used by the converter when an illegal or invalid sequence is found. | |
| void | ucnv_fromUnicode (UConverter *converter, char **target, const char *targetLimit, const UChar **source, const UChar *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err) |
| Converts an array of unicode characters to an array of codepage characters. | |
| void | ucnv_toUnicode (UConverter *converter, UChar **target, const UChar *targetLimit, const char **source, const char *sourceLimit, int32_t *offsets, UBool flush, UErrorCode *err) |
| Converts a buffer of codepage bytes into an array of unicode UChars characters. | |
| int32_t | ucnv_fromUChars (UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) |
| Convert the Unicode string into a codepage string using an existing UConverter. | |
| int32_t | ucnv_toUChars (UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) |
| Convert the codepage string into a Unicode string using an existing UConverter. | |
| UChar32 | ucnv_getNextUChar (UConverter *converter, const char **source, const char *sourceLimit, UErrorCode *err) |
| Convert a codepage buffer into Unicode one character at a time. | |
| void | ucnv_convertEx (UConverter *targetCnv, UConverter *sourceCnv, char **target, const char *targetLimit, const char **source, const char *sourceLimit, UChar *pivotStart, UChar **pivotSource, UChar **pivotTarget, const UChar *pivotLimit, UBool reset, UBool flush, UErrorCode *pErrorCode) |
| Convert from one external charset to another using two existing UConverters. | |
| int32_t | ucnv_convert (const char *toConverterName, const char *fromConverterName, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *pErrorCode) |
| Convert from one external charset to another. | |
| int32_t | ucnv_toAlgorithmic (UConverterType algorithmicType, UConverter *cnv, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *pErrorCode) |
| Convert from one external charset to another. | |
| int32_t | ucnv_fromAlgorithmic (UConverter *cnv, UConverterType algorithmicType, char *target, int32_t targetCapacity, const char *source, int32_t sourceLength, UErrorCode *pErrorCode) |
| Convert from one external charset to another. | |
| int32_t | ucnv_flushCache (void) |
| Frees up memory occupied by unused, cached converter shared data. | |
| int32_t | ucnv_countAvailable (void) |
| Returns the number of available converters, as per the alias file. | |
| const char * | ucnv_getAvailableName (int32_t n) |
| Gets the canonical converter name of the specified converter from a list of all available converters contained in the alias file. | |
| UEnumeration * | ucnv_openAllNames (UErrorCode *pErrorCode) |
| Returns a UEnumeration to enumerate all of the canonical converter names, as per the alias file, regardless of the ability to open each converter. | |
| uint16_t | ucnv_countAliases (const char *alias, UErrorCode *pErrorCode) |
| Gives the number of aliases for a given converter or alias name. | |
| const char * | ucnv_getAlias (const char *alias, uint16_t n, UErrorCode *pErrorCode) |
| Gives the name of the alias at given index of alias list. | |
| void | ucnv_getAliases (const char *alias, const char **aliases, UErrorCode *pErrorCode) |
| Fill-up the list of alias names for the given alias. | |
| UEnumeration * | ucnv_openStandardNames (const char *convName, const char *standard, UErrorCode *pErrorCode) |
| Return a new UEnumeration object for enumerating all the alias names for a given converter that are recognized by a standard. | |
| uint16_t | ucnv_countStandards (void) |
| Gives the number of standards associated to converter names. | |
| const char * | ucnv_getStandard (uint16_t n, UErrorCode *pErrorCode) |
| Gives the name of the standard at given index of standard list. | |
| const char * | ucnv_getStandardName (const char *name, const char *standard, UErrorCode *pErrorCode) |
| Returns a standard name for a given converter name. | |
| const char * | ucnv_getCanonicalName (const char *alias, const char *standard, UErrorCode *pErrorCode) |
| This function will return the internal canonical converter name of the tagged alias. | |
| const char * | ucnv_getDefaultName (void) |
| Returns the current default converter name. | |
| void | ucnv_setDefaultName (const char *name) |
| This function is not thread safe. | |
| void | ucnv_fixFileSeparator (const UConverter *cnv, UChar *source, int32_t sourceLen) |
| Fixes the backslash character mismapping. | |
| UBool | ucnv_isAmbiguous (const UConverter *cnv) |
| Determines if the converter contains ambiguous mappings of the same character or not. | |
| void | ucnv_setFallback (UConverter *cnv, UBool usesFallback) |
| Sets the converter to use fallback mappings or not. | |
| UBool | ucnv_usesFallback (const UConverter *cnv) |
| Determines if the converter uses fallback mappings or not. | |
| const char * | ucnv_detectUnicodeSignature (const char *source, int32_t sourceLength, int32_t *signatureLength, UErrorCode *pErrorCode) |
| Detects Unicode signature byte sequences at the start of the byte stream and returns the charset name of the indicated Unicode charset. | |
| int32_t | ucnv_fromUCountPending (const UConverter *cnv, UErrorCode *status) |
| Returns the number of UChars held in the converter's internal state because more input is needed for completing the conversion. | |
| int32_t | ucnv_toUCountPending (const UConverter *cnv, UErrorCode *status) |
| Returns the number of chars held in the converter's internal state because more input is needed for completing the conversion. | |
This API is used to convert codepage or character encoded data to and from UTF-16. You can open a converter with ucnv_open(). With that converter, you can get its properties, set options, convert your data and close the converter.
Since many software programs recognize different converter names for different types of converters, there are other functions in this API to iterate over the converter aliases. The functions ucnv_getAvailableName(), ucnv_getAlias() and ucnv_getStandardName() are some of the more frequently used alias functions to get this information.
When a converter encounters an illegal, irregular, invalid or unmappable character its default behavior is to use a substitution character to replace the bad byte sequence. This behavior can be changed by using ucnv_setFromUCallBack() or ucnv_setToUCallBack() on the converter. The header ucnv_err.h defines many other callback actions that can be used instead of a character substitution.
More information about this API can be found in our User's Guide.
Definition in file ucnv.h.
| #define U_CNV_SAFECLONE_BUFFERSIZE 1024 |
Definition of a buffer size that is designed to be large enough for converters to be cloned with ucnv_safeClone().
| #define UCNV_GET_MAX_BYTES_FOR_STRING | ( | length, | |||
| maxCharSize | ) | (((int32_t)(length)+10)*(int32_t)(maxCharSize)) |
Calculates the size of a buffer for conversion from Unicode to a charset.
The calculated size is guaranteed to be sufficient for this conversion.
It takes into account initial and final non-character bytes that are output by some converters. It does not take into account callbacks which output more than one charset character sequence per call, like escape callbacks. The default (substitution) callback only outputs one charset character sequence.
| length | Number of UChars to be converted. | |
| maxCharSize | Return value from ucnv_getMaxCharSize() for the converter that will be used. |
| #define UCNV_LOCALE_OPTION_STRING ",locale=" |
| #define UCNV_MAX_CONVERTER_NAME_LENGTH 60 |
| #define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH) |
| #define UCNV_OPTION_SEP_CHAR ',' |
| #define UCNV_OPTION_SEP_STRING "," |
| #define UCNV_SI 0x0F |
| #define UCNV_SO 0x0E |
| #define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl" |
| #define UCNV_VALUE_SEP_CHAR '=' |
| #define UCNV_VALUE_SEP_STRING "=" |
| #define UCNV_VERSION_OPTION_STRING ",version=" |
| enum UConverterPlatform |
Enum for specifying which platform a converter ID refers to.
The use of platform/CCSID is not recommended. See ucnv_openCCSID().
| enum UConverterType |
| enum UConverterUnicodeSet |
Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet().
| UCNV_ROUNDTRIP_SET |
Select the set of roundtrippable Unicode code points.
|
| UCNV_ROUNDTRIP_AND_FALLBACK_SET |
Select the set of Unicode code points with roundtrip or fallback mappings.
|
| UCNV_SET_COUNT |
Number of UConverterUnicodeSet selectors.
|
| void ucnv_close | ( | UConverter * | converter | ) |
| int ucnv_compareNames | ( | const char * | name1, | |
| const char * | name2 | |||
| ) |
Do a fuzzy compare of two converter/alias names.
The comparison is case-insensitive, ignores leading zeroes if they are not followed by further digits, and ignores all but letters and digits. Thus the strings "UTF-8", "utf_8", "u*T@f08" and "Utf 8" are exactly equivalent. See section 1.4, Charset Alias Matching in Unicode Technical Standard #22 at http://www.unicode.org/reports/tr22/
| name1 | a converter name or alias, zero-terminated | |
| name2 | a converter name or alias, zero-terminated |
| int32_t ucnv_convert | ( | const char * | toConverterName, | |
| const char * | fromConverterName, | |||
| char * | target, | |||
| int32_t | targetCapacity, | |||
| const char * | source, | |||
| int32_t | sourceLength, | |||
| UErrorCode * | pErrorCode | |||
| ) |
Convert from one external charset to another.
Internally, two converters are opened according to the name arguments, then the text is converted to and from the 16-bit Unicode "pivot" using ucnv_convertEx(), then the converters are closed again.
This is a convenience function, not an efficient way to convert a lot of text: ucnv_convert()
The function returns when one of the following is true:
| toConverterName | The name of the converter that is used to convert from the UTF-16 pivot buffer to the target. | |
| fromConverterName | The name of the converter that is used to convert from the source to the UTF-16 pivot buffer. | |
| target | Pointer to the output buffer. | |
| targetCapacity | Capacity of the target, in bytes. | |
| source | Pointer to the input buffer. | |
| sourceLength | Length of the input text, in bytes, or -1 for NUL-terminated input. | |
| pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
| void ucnv_convertEx | ( | UConverter * | targetCnv, | |
| UConverter * | sourceCnv, | |||
| char ** | target, | |||
| const char * | targetLimit, | |||
| const char ** | source, | |||
| const char * | sourceLimit, | |||
| UChar * | pivotStart, | |||
| UChar ** | pivotSource, | |||
| UChar ** | pivotTarget, | |||
| const UChar * | pivotLimit, | |||
| UBool | reset, | |||
| UBool | flush, | |||
| UErrorCode * | pErrorCode | |||
| ) |
Convert from one external charset to another using two existing UConverters.
Internally, two conversions - ucnv_toUnicode() and ucnv_fromUnicode() - are used, "pivoting" through 16-bit Unicode.
Important: For streaming conversion (multiple function calls for successive parts of a text stream), the caller must provide a pivot buffer explicitly, and must preserve the pivot buffer and associated pointers from one call to another. (The buffer may be moved if its contents and the relative pointer positions are preserved.)
There is a similar function, ucnv_convert(), which has the following limitations:
By contrast, ucnv_convertEx()
ucnv_convertEx() also provides further convenience:
The function returns when one of the following is true:
Limitation compared to the direct use of ucnv_fromUnicode() and ucnv_toUnicode(): ucnv_convertEx() does not provide offset information.
Limitation compared to ucnv_fromUChars() and ucnv_toUChars(): ucnv_convertEx() does not support preflighting directly.
Sample code for converting a single string from one external charset to UTF-8, ignoring the location of errors:
int32_t myToUTF8(UConverter *cnv,
                 const char *s, int32_t length,
                 char *u8, int32_t capacity,
                 UErrorCode *pErrorCode) {
    UConverter *utf8Cnv;
    char *target;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    utf8Cnv=myGetCachedUTF8Converter(pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(length<0) {
        length=strlen(s);
    }
    target=u8;
    ucnv_convertEx(utf8Cnv, cnv,
                   &target, u8+capacity,
                   &s, s+length,
                   NULL, NULL, NULL, NULL,
                   TRUE, TRUE,
                   pErrorCode);

    myReleaseCachedUTF8Converter(utf8Cnv);

    // return the output string length, but without preflighting
    return (int32_t)(target-u8);
}
| targetCnv | Output converter, used to convert from the UTF-16 pivot to the target using ucnv_fromUnicode(). | |
| sourceCnv | Input converter, used to convert from the source to the UTF-16 pivot using ucnv_toUnicode(). | |
| target | I/O parameter, same as for ucnv_fromUChars(). Input: *target points to the beginning of the target buffer. Output: *target points to the first unit after the last char written. | |
| targetLimit | Pointer to the first unit after the target buffer. | |
| source | I/O parameter, same as for ucnv_toUChars(). Input: *source points to the beginning of the source buffer. Output: *source points to the first unit after the last char read. | |
| sourceLimit | Pointer to the first unit after the source buffer. | |
| pivotStart | Pointer to the UTF-16 pivot buffer. If pivotStart==NULL, then an internal buffer is used and the other pivot arguments are ignored and can be NULL as well. | |
| pivotSource | I/O parameter, same as source in ucnv_fromUChars() for conversion from the pivot buffer to the target buffer. | |
| pivotTarget | I/O parameter, same as target in ucnv_toUChars() for conversion from the source buffer to the pivot buffer. It must be pivotStart<=*pivotSource<=*pivotTarget<=pivotLimit and pivotStart<pivotLimit (unless pivotStart==NULL). | |
| pivotLimit | Pointer to the first unit after the pivot buffer. | |
| reset | If TRUE, then ucnv_resetToUnicode(sourceCnv) and ucnv_resetFromUnicode(targetCnv) are called, and the pivot pointers are reset (*pivotTarget=*pivotSource=pivotStart). | |
| flush | If true, indicates the end of the input. Passed directly to ucnv_toUnicode(), and carried over to ucnv_fromUnicode() when the source is empty as well. | |
| pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. U_BUFFER_OVERFLOW_ERROR always refers to the target buffer because overflows into the pivot buffer are handled internally. Other conversion errors are from the source-to-pivot conversion if *pivotSource==pivotStart, otherwise from the pivot-to-target conversion. |
| uint16_t ucnv_countAliases | ( | const char * | alias, | |
| UErrorCode * | pErrorCode | |||
| ) |
Gives the number of aliases for a given converter or alias name.
If the alias is ambiguous, then the preferred converter is used and the status is set to U_AMBIGUOUS_ALIAS_WARNING. This method only enumerates the listed entries in the alias file.
| alias | alias name | |
| pErrorCode | error status |
| int32_t ucnv_countAvailable | ( | void | ) |
Returns the number of available converters, as per the alias file.
| uint16_t ucnv_countStandards | ( | void | ) |
Gives the number of standards associated to converter names.
| const char* ucnv_detectUnicodeSignature | ( | const char * | source, | |
| int32_t | sourceLength, | |||
| int32_t * | signatureLength, | |||
| UErrorCode * | pErrorCode | |||
| ) |
Detects Unicode signature byte sequences at the start of the byte stream and returns the charset name of the indicated Unicode charset.
NULL is returned when no Unicode signature is recognized. The number of bytes in the signature is output as well.
The caller can ucnv_open() a converter using the charset name. The first code unit (UChar) from the start of the stream will be U+FEFF (the Unicode BOM/signature character) and can usually be ignored.
For most Unicode charsets it is also possible to ignore the indicated number of initial stream bytes and start converting after them. However, there are stateful Unicode charsets (UTF-7 and BOCU-1) for which this will not work. Therefore, it is best to ignore the first output UChar instead of the input signature bytes.
Usage:
UErrorCode err = U_ZERO_ERROR;
char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
int32_t signatureLength = 0;
const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
UConverter *conv = NULL;
UChar output[100];
UChar *target = output, *out;
const char *source = input;
if(encoding!=NULL && U_SUCCESS(err)){
    // should signature be discarded ?
    conv = ucnv_open(encoding, &err);
    // do the conversion
    ucnv_toUnicode(conv,
                   &target, output + sizeof(output)/U_SIZEOF_UCHAR,
                   &source, input + sizeof(input),
                   NULL, TRUE, &err);
    out = output;
    if (discardSignature){
        ++out; // ignore initial U+FEFF
    }
    while(out != target) {
        printf("%04x ", *out++);
    }
    puts("");
}
| source | The source string in which the signature should be detected. | |
| sourceLength | Length of the input string, or -1 if terminated with a NUL byte. | |
| signatureLength | A pointer to int32_t to receive the number of bytes that make up the signature of the detected UTF. 0 if not detected. Can be a NULL pointer. | |
| pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
| void ucnv_fixFileSeparator | ( | const UConverter * | cnv, | |
| UChar * | source, | |||
| int32_t | sourceLen | |||
| ) |
Fixes the backslash character mismapping.
For example, in SJIS, the backslash character in the ASCII portion is also used to represent the yen currency sign. When mapping from Unicode character 0x005C, it's unclear whether to map the character back to yen or backslash in SJIS. This function will take the input buffer and replace all the yen sign characters with backslash. This is necessary when the user tries to open a file with the input buffer on Windows. This function will test the converter to see whether such mapping is required. You can sometimes avoid using this function by using the correct version of Shift-JIS.
| cnv | The converter representing the target codepage. | |
| source | the input buffer to be fixed | |
| sourceLen | the length of the input buffer |
| int32_t ucnv_flushCache | ( | void | ) |
Frees up memory occupied by unused, cached converter shared data.
| int32_t ucnv_fromAlgorithmic | ( | UConverter * | cnv, | |
| UConverterType | algorithmicType, | |||
| char * | target, | |||
| int32_t | targetCapacity, | |||
| const char * | source, | |||
| int32_t | sourceLength, | |||
| UErrorCode * | pErrorCode | |||
| ) |
Convert from one external charset to another.
Internally, the text is converted to and from the 16-bit Unicode "pivot" using ucnv_convertEx(). ucnv_fromAlgorithmic() works exactly like ucnv_convert() except that the two converters need not be looked up and opened completely.
The source-to-pivot conversion uses a purely algorithmic converter according to the specified type, e.g., UCNV_UTF8 for a UTF-8 converter. The pivot-to-target conversion uses the cnv converter parameter.
Internally, the algorithmic converter is opened and closed for each function call, which is more efficient than using the public ucnv_open() but somewhat less efficient than only resetting an existing converter and using ucnv_convertEx().
This function is more convenient than ucnv_convertEx() for single-string conversions, especially when "preflighting" is desired (returning the length of the complete output even if it does not fit into the target buffer; see the User Guide Strings chapter). See ucnv_convert() for details.
| cnv | The converter that is used to convert from the UTF-16 pivot buffer to the target. | |
| algorithmicType | UConverterType constant identifying the desired source charset as a purely algorithmic converter. Those are converters for Unicode charsets like UTF-8, BOCU-1, SCSU, UTF-7, IMAP-mailbox-name, etc., as well as US-ASCII and ISO-8859-1. | |
| target | Pointer to the output buffer. | |
| targetCapacity | Capacity of the target, in bytes. | |
| source | Pointer to the input buffer. | |
| sourceLength | Length of the input text, in bytes | |
| pErrorCode | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
| int32_t ucnv_fromUChars | ( | UConverter * | cnv, | |
| char * | dest, | |||
| int32_t | destCapacity, | |||
| const UChar * | src, | |||
| int32_t | srcLength, | |||
| UErrorCode * | pErrorCode | |||
| ) |
Convert the Unicode string into a codepage string using an existing UConverter.
The output string is NUL-terminated if possible.
This function is a more convenient but less powerful version of ucnv_fromUnicode(). It is only useful for whole strings, not for streaming conversion.
The maximum output buffer capacity required (barring output from callbacks) will be UCNV_GET_MAX_BYTES_FOR_STRING(srcLength, ucnv_getMaxCharSize(cnv)).
| cnv | the converter object to be used (ucnv_resetFromUnicode() will be called) | |
| src | the input Unicode string | |
| srcLength | the input string length, or -1 if NUL-terminated | |
| dest | destination string buffer, can be NULL if destCapacity==0 | |
| destCapacity | the number of chars available at dest | |
| pErrorCode | normal ICU error code; common error codes that may be set by this function include U_BUFFER_OVERFLOW_ERROR, U_STRING_NOT_TERMINATED_WARNING, U_ILLEGAL_ARGUMENT_ERROR, and conversion errors |
| int32_t ucnv_fromUCountPending | ( | const UConverter * | cnv, | |
| UErrorCode * | status | |||
| ) |
Returns the number of UChars held in the converter's internal state because more input is needed for completing the conversion.
This function is useful for mapping semantics of ICU's converter interface to those of iconv, and this information is not needed for normal conversion.
| cnv | The converter in which the input is held | |
| status | ICU error code in/out parameter. Must fulfill U_SUCCESS before the function call. |
| void ucnv_fromUnicode | ( | UConverter * | converter, | |
| char ** | target, | |||
| const char * | targetLimit, | |||
| const UChar ** | source, | |||
| const UChar * | sourceLimit, | |||
| int32_t * | offsets, | |||
| UBool | flush, | |||
| UErrorCode * | err | |||
| ) |
Converts an array of unicode characters to an array of codepage characters.
This function is optimized for converting a continuous stream of data in buffer-sized chunks, where the entire source and target does not fit in available buffers.
The source pointer is an in/out parameter. It starts out pointing where the conversion is to begin, and ends up pointing after the last UChar consumed.
Target similarly starts ou