ICU 62.1  62.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if !UCONFIG_NO_NORMALIZATION
30 
31 #include "unicode/stringpiece.h"
32 #include "unicode/uniset.h"
33 #include "unicode/unistr.h"
34 #include "unicode/unorm2.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 class ByteSink;
39 
83 class U_COMMON_API Normalizer2 : public UObject {
84 public:
89  ~Normalizer2();
90 
102  static const Normalizer2 *
103  getNFCInstance(UErrorCode &errorCode);
104 
116  static const Normalizer2 *
117  getNFDInstance(UErrorCode &errorCode);
118 
130  static const Normalizer2 *
131  getNFKCInstance(UErrorCode &errorCode);
132 
144  static const Normalizer2 *
145  getNFKDInstance(UErrorCode &errorCode);
146 
158  static const Normalizer2 *
159  getNFKCCasefoldInstance(UErrorCode &errorCode);
160 
182  static const Normalizer2 *
183  getInstance(const char *packageName,
184  const char *name,
185  UNormalization2Mode mode,
186  UErrorCode &errorCode);
187 
198  UnicodeString
199  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
200  UnicodeString result;
201  normalize(src, result, errorCode);
202  return result;
203  }
217  virtual UnicodeString &
218  normalize(const UnicodeString &src,
219  UnicodeString &dest,
220  UErrorCode &errorCode) const = 0;
221 
246  virtual void
247  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248  Edits *edits, UErrorCode &errorCode) const;
249 
264  virtual UnicodeString &
265  normalizeSecondAndAppend(UnicodeString &first,
266  const UnicodeString &second,
267  UErrorCode &errorCode) const = 0;
282  virtual UnicodeString &
283  append(UnicodeString &first,
284  const UnicodeString &second,
285  UErrorCode &errorCode) const = 0;
286 
300  virtual UBool
301  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
302 
327  virtual UBool
328  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329 
345  virtual UChar32
346  composePair(UChar32 a, UChar32 b) const;
347 
356  virtual uint8_t
357  getCombiningClass(UChar32 c) const;
358 
373  virtual UBool
374  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
396  virtual UBool
397  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
398 
399 
415  virtual UNormalizationCheckResult
416  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
417 
440  virtual int32_t
441  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
442 
456  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
457 
472  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
473 
487  virtual UBool isInert(UChar32 c) const = 0;
488 };
489 
501 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
502 public:
513  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
514  norm2(n2), set(filterSet) {}
515 
520  ~FilteredNormalizer2();
521 
535  virtual UnicodeString &
536  normalize(const UnicodeString &src,
537  UnicodeString &dest,
538  UErrorCode &errorCode) const U_OVERRIDE;
539 
564  virtual void
565  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
566  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
567 
582  virtual UnicodeString &
583  normalizeSecondAndAppend(UnicodeString &first,
584  const UnicodeString &second,
585  UErrorCode &errorCode) const U_OVERRIDE;
600  virtual UnicodeString &
601  append(UnicodeString &first,
602  const UnicodeString &second,
603  UErrorCode &errorCode) const U_OVERRIDE;
604 
616  virtual UBool
617  getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
618 
630  virtual UBool
631  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
632 
643  virtual UChar32
644  composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
645 
654  virtual uint8_t
655  getCombiningClass(UChar32 c) const U_OVERRIDE;
656 
668  virtual UBool
669  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
691  virtual UBool
692  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
704  virtual UNormalizationCheckResult
705  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
717  virtual int32_t
718  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
719 
728  virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
729 
738  virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
739 
747  virtual UBool isInert(UChar32 c) const U_OVERRIDE;
748 private:
749  UnicodeString &
750  normalize(const UnicodeString &src,
751  UnicodeString &dest,
752  USetSpanCondition spanCondition,
753  UErrorCode &errorCode) const;
754 
755  void
756  normalizeUTF8(uint32_t options, const char *src, int32_t length,
757  ByteSink &sink, Edits *edits,
758  USetSpanCondition spanCondition,
759  UErrorCode &errorCode) const;
760 
761  UnicodeString &
762  normalizeSecondAndAppend(UnicodeString &first,
763  const UnicodeString &second,
764  UBool doNormalize,
765  UErrorCode &errorCode) const;
766 
767  const Normalizer2 &norm2;
768  const UnicodeSet &set;
769 };
770 
771 U_NAMESPACE_END
772 
773 #endif // !UCONFIG_NO_NORMALIZATION
774 #endif // __NORMALIZER2_H__
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context...
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
unistr.h
C++ API: Unicode String.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
ByteSink
A ByteSink can be filled with bytes.
Definition: bytestream.h:50
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:199
Edits
Records lengths of string edits but not replacement text.
Definition: edits.h:77
stringpiece.h
C++ API: StringPiece: Read-only byte string wrapper class.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first st...
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
Normalizer2
Unicode normalization functionality for standard Unicode normalization or for using custom mapping ta...
Definition: normalizer2.h:83
unorm2.h
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:400
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one. ...
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:513
UnicodeSet
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:278
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:152
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:45
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) a...
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:396
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchange...
utypes.h
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:359
UnicodeString
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:286
StringPiece
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:54
UObject
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
FilteredNormalizer2
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:501
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:94
int8_t UBool
The ICU boolean type.
Definition: umachine.h:236
uniset.h
C++ API: Unicode Set.