ICU 63.1  63.1
uniset.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12 
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15 
16 #include "unicode/ucpmap.h"
17 #include "unicode/unifilt.h"
18 #include "unicode/unistr.h"
19 #include "unicode/uset.h"
20 
27 
28 // Forward Declarations.
29 class BMPSet;
30 class CharacterProperties;
31 class ParsePosition;
32 class RBBIRuleScanner;
33 class SymbolTable;
34 class UnicodeSetStringSpan;
35 class UVector;
36 class RuleCharacterIterator;
37 
// A mutable set of Unicode characters and multicharacter strings.
// (Class head restored: the listing index places this definition at uniset.h:278,
// and the matching closing "};" is still present at the end of the member list.)
278 class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
279 
// Internal state of the set. These members precede the first access specifier.
280  int32_t len; // length of list used; 0 <= len <= capacity
281  int32_t capacity; // capacity of list
282  UChar32* list; // MUST be terminated with HIGH
283  BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
284  UChar32* buffer; // internal buffer, may be NULL
285  int32_t bufferCapacity; // capacity of buffer
286  int32_t patLen; // NOTE(review): presumably the length of pat in char16_t units -- confirm
287 
297  char16_t *pat; // NOTE(review): presumably the pattern text handled by setPattern()/releasePattern() -- confirm
298  UVector* strings; // maintained in sorted order
299  UnicodeSetStringSpan *stringSpan; // non-NULL only on a frozen set (isFrozen() tests bmpSet and stringSpan)
300 
301 private:
302  enum { // constants
303  kIsBogus = 1 // This set is bogus (i.e. not valid)
304  };
305  uint8_t fFlags; // Bit flag (see constants above)
306 public:
// Determine if this object contains a valid set.
// The inline definition returns the kIsBogus bit of fFlags, so a nonzero result means the set is bogus.
316  inline UBool isBogus(void) const;
317 
334  void setToBogus();
335 
336 public:
337 
// Numeric bounds exposed as class constants: 0 and 0x10ffff, i.e. U+0000 and U+10FFFF.
// NOTE(review): presumably the smallest and largest code point a set can hold -- confirm.
338  enum {
343  MIN_VALUE = 0,
344 
349  MAX_VALUE = 0x10ffff
350  };
351 
352  //----------------------------------------------------------------
353  // Constructors &c
354  //----------------------------------------------------------------
355 
356 public:
357 
362  UnicodeSet();
363 
372  UnicodeSet(UChar32 start, UChar32 end);
373 
374 #ifndef U_HIDE_INTERNAL_API
375 
// Tag type for the constructor below. Its only enumerator, kSerialized, marks the
// input as the result of serialize().
// (The "enum ESerialization {" opener was missing from the listing, leaving a
// dangling enumerator and "};"; it is restored here.)
378  enum ESerialization {
379  kSerialized /* result of serialize() */
380  };
381 
// Constructs a set from a uint16_t buffer of bufferLen units; `serialization`
// selects the buffer format and `status` reports failure.
// NOTE(review): presumably `buffer` holds data written by serialize() -- confirm.
392  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
393  ESerialization serialization, UErrorCode &status);
394 #endif /* U_HIDE_INTERNAL_API */
395 
404  UnicodeSet(const UnicodeString& pattern,
405  UErrorCode& status);
406 
407 #ifndef U_HIDE_INTERNAL_API
408 
420  UnicodeSet(const UnicodeString& pattern,
421  uint32_t options,
422  const SymbolTable* symbols,
423  UErrorCode& status);
424 #endif /* U_HIDE_INTERNAL_API */
425 
439  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
440  uint32_t options,
441  const SymbolTable* symbols,
442  UErrorCode& status);
443 
448  UnicodeSet(const UnicodeSet& o);
449 
454  virtual ~UnicodeSet();
455 
461  UnicodeSet& operator=(const UnicodeSet& o);
462 
474  virtual UBool operator==(const UnicodeSet& o) const;
475 
// Inequality test; the inline definition returns !operator==(o).
481  UBool operator!=(const UnicodeSet& o) const;
482 
492  virtual UnicodeFunctor* clone() const;
493 
501  virtual int32_t hashCode(void) const;
502 
// Get a UnicodeSet pointer from a USet.
511  inline static UnicodeSet *fromUSet(USet *uset);
512 
// Const overload: get a const UnicodeSet pointer from a const USet.
// The inline definition is a reinterpret_cast of the same pointer; nothing is copied.
521  inline static const UnicodeSet *fromUSet(const USet *uset);
522 
// Produce a USet * pointer for this UnicodeSet.
530  inline USet *toUSet();
531 
532 
// Const overload: produce a const USet * pointer for this UnicodeSet.
// The inline definition is a reinterpret_cast of `this`; nothing is copied.
540  inline const USet * toUSet() const;
541 
542 
543  //----------------------------------------------------------------
544  // Freezable API
545  //----------------------------------------------------------------
546 
// Determines whether the set has been frozen (made immutable) or not.
// The inline definition reports frozen when bmpSet or stringSpan is non-NULL.
555  inline UBool isFrozen() const;
556 
570  UnicodeFunctor *freeze();
571 
580  UnicodeFunctor *cloneAsThawed() const;
581 
582  //----------------------------------------------------------------
583  // Public API
584  //----------------------------------------------------------------
585 
595  UnicodeSet& set(UChar32 start, UChar32 end);
596 
602  static UBool resemblesPattern(const UnicodeString& pattern,
603  int32_t pos);
604 
617  UnicodeSet& applyPattern(const UnicodeString& pattern,
618  UErrorCode& status);
619 
620 #ifndef U_HIDE_INTERNAL_API
621 
637  UnicodeSet& applyPattern(const UnicodeString& pattern,
638  uint32_t options,
639  const SymbolTable* symbols,
640  UErrorCode& status);
641 #endif /* U_HIDE_INTERNAL_API */
642 
674  UnicodeSet& applyPattern(const UnicodeString& pattern,
675  ParsePosition& pos,
676  uint32_t options,
677  const SymbolTable* symbols,
678  UErrorCode& status);
679 
693  virtual UnicodeString& toPattern(UnicodeString& result,
694  UBool escapeUnprintable = FALSE) const;
695 
718  UnicodeSet& applyIntPropertyValue(UProperty prop,
719  int32_t value,
720  UErrorCode& ec);
721 
751  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
752  const UnicodeString& value,
753  UErrorCode& ec);
754 
763  virtual int32_t size(void) const;
764 
771  virtual UBool isEmpty(void) const;
772 
780  virtual UBool contains(UChar32 c) const;
781 
790  virtual UBool contains(UChar32 start, UChar32 end) const;
791 
799  UBool contains(const UnicodeString& s) const;
800 
808  virtual UBool containsAll(const UnicodeSet& c) const;
809 
817  UBool containsAll(const UnicodeString& s) const;
818 
827  UBool containsNone(UChar32 start, UChar32 end) const;
828 
836  UBool containsNone(const UnicodeSet& c) const;
837 
845  UBool containsNone(const UnicodeString& s) const;
846 
// Returns true if this set contains one or more of the characters in the given range.
// Defined inline as !containsNone(start, end).
855  inline UBool containsSome(UChar32 start, UChar32 end) const;
856 
// Set overload of containsSome(); defined inline as !containsNone(s).
864  inline UBool containsSome(const UnicodeSet& s) const;
865 
// String overload of containsSome(); defined inline as !containsNone(s).
873  inline UBool containsSome(const UnicodeString& s) const;
874 
// Returns the length of the initial substring of the input string selected by this set
// under spanCondition (the reference summary is truncated in this listing).
// s/length give the UTF-16 text to scan.
893  int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
894 
// UnicodeString convenience overload of span().
// The inline definition pins `start` to [0, s.length()], spans the text from `start`
// with the char16_t* overload, and returns start plus that result.
907  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
908 
// Returns the start of the trailing substring of the input string selected by this set
// under spanCondition (the reference summary is truncated in this listing).
// s/length give the UTF-16 text to scan.
926  int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
927 
// UnicodeString convenience overload of spanBack().
// The inline definition pins `limit` to [0, s.length()] and calls the char16_t* overload
// on the first `limit` code units of s.
941  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
942 
961  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
962 
980  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
981 
// Implement UnicodeMatcher API.
// Returns a UMatchDegree, the enum of constants indicating the degree of match.
// `offset` is passed by non-const reference.
// NOTE(review): presumably `offset` is advanced past matched text -- confirm against the implementation.
986  virtual UMatchDegree matches(const Replaceable& text,
987  int32_t& offset,
988  int32_t limit,
989  UBool incremental);
990 
991 private:
1014  static int32_t matchRest(const Replaceable& text,
1015  int32_t start, int32_t limit,
1016  const UnicodeString& s);
1017 
1027  int32_t findCodePoint(UChar32 c) const;
1028 
1029 public:
1030 
1038  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1039 
1048  int32_t indexOf(UChar32 c) const;
1049 
1059  UChar32 charAt(int32_t index) const;
1060 
1075  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1076 
1084  UnicodeSet& add(UChar32 c);
1085 
1097  UnicodeSet& add(const UnicodeString& s);
1098 
1099  private:
1105  static int32_t getSingleCP(const UnicodeString& s);
1106 
1107  void _add(const UnicodeString& s);
1108 
1109  public:
1118  UnicodeSet& addAll(const UnicodeString& s);
1119 
1128  UnicodeSet& retainAll(const UnicodeString& s);
1129 
1138  UnicodeSet& complementAll(const UnicodeString& s);
1139 
1148  UnicodeSet& removeAll(const UnicodeString& s);
1149 
1158  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1159 
1160 
1168  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1169 
1183  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1184 
1185 
1191  UnicodeSet& retain(UChar32 c);
1192 
1206  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1207 
1215  UnicodeSet& remove(UChar32 c);
1216 
1226  UnicodeSet& remove(const UnicodeString& s);
1227 
1235  virtual UnicodeSet& complement(void);
1236 
1251  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1252 
1260  UnicodeSet& complement(UChar32 c);
1261 
1272  UnicodeSet& complement(const UnicodeString& s);
1273 
1286  virtual UnicodeSet& addAll(const UnicodeSet& c);
1287 
1299  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1300 
1312  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1313 
1324  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1325 
1332  virtual UnicodeSet& clear(void);
1333 
1359  UnicodeSet& closeOver(int32_t attribute);
1360 
1367  virtual UnicodeSet &removeAllStrings();
1368 
1376  virtual int32_t getRangeCount(void) const;
1377 
1385  virtual UChar32 getRangeStart(int32_t index) const;
1386 
1394  virtual UChar32 getRangeEnd(int32_t index) const;
1395 
1444  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1445 
1452  virtual UnicodeSet& compact();
1453 
// ICU "poor man's RTTI", returns a UClassID for this class.
1465  static UClassID U_EXPORT2 getStaticClassID(void);
1466 
1475  virtual UClassID getDynamicClassID(void) const;
1476 
1477 private:
1478 
1479  // Private API for the USet API
1480 
1481  friend class USetAccess;
1482 
1483  int32_t getStringCount() const;
1484 
1485  const UnicodeString* getString(int32_t index) const;
1486 
1487  //----------------------------------------------------------------
1488  // RuleBasedTransliterator support
1489  //----------------------------------------------------------------
1490 
1491 private:
1492 
1498  virtual UBool matchesIndexValue(uint8_t v) const;
1499 
1500 private:
1501  friend class RBBIRuleScanner;
1502 
1503  //----------------------------------------------------------------
1504  // Implementation: Clone as thawed (see ICU4J Freezable)
1505  //----------------------------------------------------------------
1506 
1507  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1508  UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1509 
1510  //----------------------------------------------------------------
1511  // Implementation: Pattern parsing
1512  //----------------------------------------------------------------
1513 
1514  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1515  ParsePosition& pos,
1516  const SymbolTable* symbols,
1517  UErrorCode& status);
1518 
1519  void applyPattern(RuleCharacterIterator& chars,
1520  const SymbolTable* symbols,
1521  UnicodeString& rebuiltPat,
1522  uint32_t options,
1523  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1524  int32_t depth,
1525  UErrorCode& ec);
1526 
1527  //----------------------------------------------------------------
1528  // Implementation: Utility methods
1529  //----------------------------------------------------------------
1530 
1531  void ensureCapacity(int32_t newLen, UErrorCode& ec);
1532 
1533  void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
1534 
1535  void swapBuffers(void);
1536 
1537  UBool allocateStrings(UErrorCode &status);
1538 
1539  UnicodeString& _toPattern(UnicodeString& result,
1540  UBool escapeUnprintable) const;
1541 
1542  UnicodeString& _generatePattern(UnicodeString& result,
1543  UBool escapeUnprintable) const;
1544 
1545  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1546 
1547  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1548 
1549  //----------------------------------------------------------------
1550  // Implementation: Fundamental operators
1551  //----------------------------------------------------------------
1552 
1553  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1554 
1555  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1556 
1557  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1558 
1564  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1565  int32_t pos);
1566 
1567  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1568  int32_t iterOpts);
1569 
1609  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1610  ParsePosition& ppos,
1611  UErrorCode &ec);
1612 
1613  void applyPropertyPattern(RuleCharacterIterator& chars,
1614  UnicodeString& rebuiltPat,
1615  UErrorCode& ec);
1616 
1617  friend class CharacterProperties;
1618  static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1619 
1624  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1625 
1635  void applyFilter(Filter filter,
1636  void* context,
1637  const UnicodeSet* inclusions,
1638  UErrorCode &status);
1639 
1640 #ifndef U_HIDE_DRAFT_API // Skipped: ucpmap.h is draft only.
1641  void applyIntPropertyValue(const UCPMap *map,
1642  UCPMapValueFilter *filter, const void *context,
1643  UErrorCode &errorCode);
1644 #endif /* U_HIDE_DRAFT_API */
1645 
1649  void setPattern(const UnicodeString& newPat);
1653  void releasePattern();
1654 
1655  friend class UnicodeSetIterator;
1656 };
1657 
1658 
1659 
// Inequality test: returns the logical negation of operator==(o).
1660 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1661  return !operator==(o);
1662 }
1663 
// Determines whether the set has been frozen (made immutable) or not.
// A set counts as frozen when either bmpSet or stringSpan is non-NULL.
1664 inline UBool UnicodeSet::isFrozen() const {
1665  return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1666 }
1667 
// Returns true if this set contains one or more of the characters in the given range.
// Implemented as the negation of containsNone(start, end).
1668 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1669  return !containsNone(start, end);
1670 }
1671 
// Set overload of containsSome(): the negation of containsNone(s).
// (The function header on listing line 1672 was missing; it is restored to match
// the in-class declaration "inline UBool containsSome(const UnicodeSet& s) const;".)
1672 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1673  return !containsNone(s);
1674 }
1675 
// String overload of containsSome(): the negation of containsNone(s).
// (The function header on listing line 1676 was missing; it is restored to match
// the in-class declaration "inline UBool containsSome(const UnicodeString& s) const;".)
1676 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1677  return !containsNone(s);
1678 }
1679 
// Determine if this object contains a valid set.
// Returns the kIsBogus bit of fFlags: nonzero means the set is bogus (not valid).
1680 inline UBool UnicodeSet::isBogus() const {
1681  return (UBool)(fFlags & kIsBogus);
1682 }
1683 
// Get a UnicodeSet pointer from a USet.
// Reinterprets the same pointer; nothing is copied or allocated.
// (The function header on listing line 1684 was missing; it is restored to match
// the in-class declaration "inline static UnicodeSet *fromUSet(USet *uset);".)
1684 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1685  return reinterpret_cast<UnicodeSet *>(uset);
1686 }
1687 
// Const overload: get a const UnicodeSet pointer from a const USet.
// Reinterprets the same pointer; nothing is copied or allocated.
1688 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1689  return reinterpret_cast<const UnicodeSet *>(uset);
1690 }
1691 
// Produce a USet * pointer for this UnicodeSet.
// Reinterprets `this`; nothing is copied or allocated.
// (The function header on listing line 1692 was missing; it is restored to match
// the in-class declaration "inline USet *toUSet();".)
1692 inline USet *UnicodeSet::toUSet() {
1693  return reinterpret_cast<USet *>(this);
1694 }
1695 
// Const overload: produce a const USet * pointer for this UnicodeSet.
// Reinterprets `this`; nothing is copied or allocated.
1696 inline const USet *UnicodeSet::toUSet() const {
1697  return reinterpret_cast<const USet *>(this);
1698 }
1699 
// UnicodeString convenience overload of span().
//   s             text to scan
//   start         index in s at which to begin; out-of-range values are pinned to [0, s.length()]
//   spanCondition forwarded unchanged to the char16_t* overload
// Returns an index into s: the pinned start plus the length reported by the
// char16_t* overload for the text beginning at that position.
1700 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1701  int32_t sLength=s.length();
1702  if(start<0) {
1703  start=0;
1704  } else if(start>sLength) {
1705  start=sLength;
1706  }
// The pointer overload sees only the tail of s, so its result is relative to `start`.
1707  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1708 }
1709 
// UnicodeString convenience overload of spanBack().
//   s             text to scan
//   limit         end index of the text to consider; out-of-range values are pinned to [0, s.length()]
//   spanCondition forwarded unchanged to the char16_t* overload
// Returns whatever the char16_t* overload reports for the first `limit` code units of s.
1710 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1711  int32_t sLength=s.length();
1712  if(limit<0) {
1713  limit=0;
1714  } else if(limit>sLength) {
1715  limit=sLength;
1716  }
// The buffer start is unchanged, so the result is already an index into s.
1717  return spanBack(s.getBuffer(), limit, spanCondition);
1718 }
1719 
1721 
1722 #endif
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
struct UCPMap UCPMap
Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
Definition: ucpmap.h:33
int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:32
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition: usetiter.h:63
UBool isBogus(void) const
Determine if this object contains a valid set.
Definition: uniset.h:1680
UBool operator!=(const UnicodeSet &o) const
Compares the specified object with this set for equality.
Definition: uniset.h:1660
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:90
This file defines an abstract map from Unicode code points to integer values.
static UnicodeSet * fromUSet(USet *uset)
Get a UnicodeSet pointer from a USet.
Definition: uniset.h:1684
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:56
virtual UClassID getDynamicClassID(void) const =0
Returns a unique class ID polymorphically.
virtual UnicodeFunctor * clone() const =0
Return a copy of this object.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:73
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:61
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
uint32_t UCPMapValueFilter(const void *context, uint32_t value)
Callback function type: Modifies a map value.
Definition: ucpmap.h:116
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:389
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
UnicodeFunctor is an abstract base class for objects that perform match and/or replace operations on ...
Definition: unifunct.h:35
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher API.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:278
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:156
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:195
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:401
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition: uset.h:47
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3909
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:49
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:140
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const =0
Returns a string representation of this matcher.
UBool containsSome(UChar32 start, UChar32 end) const
Returns true if this set contains one or more of the characters in the given range.
Definition: uniset.h:1668
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:233
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:289
UBool isFrozen() const
Determines whether the set has been frozen (made immutable) or not.
Definition: uniset.h:1664
USet * toUSet()
Produce a USet * pointer for this UnicodeSet.
Definition: uniset.h:1692
C++ API: Unicode Filter.
int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
int8_t UBool
The ICU boolean type.
Definition: umachine.h:225