ALib C++ Framework
by
Library Version: 2605 R0
Documentation generated by doxygen
Loading...
Searching...
No Matches
tokenizer.hpp
Go to the documentation of this file.
1//==================================================================================================
2/// \file
3/// This header-file is part of module \alib_strings of the \aliblong.
4///
5/// Copyright 2013-2026 A-Worx GmbH, Germany.
6/// Published under #"mainpage_license".
7//==================================================================================================
8ALIB_EXPORT namespace alib { namespace strings {
9
10/// This sub-namespace provides some utility classes which are related
11/// to string classes found in namespace #"alib::strings".
12namespace util {
13
14//==================================================================================================
15/// This class operates on strings which contain data separated by a delimiter character.
16/// It identifies the substrings between the delimiters as \e tokens of type #"^Substring".
17/// After an instance of this class is constructed, three methods are available:
18/// - #".HasNext:" Indicates if there are further tokens available.
19/// - #".Next:" Sets field #".Actual" (which is of type #"%^Substring") to reference the next token
20/// and returns it.<br>
21/// With each call to #".Next", a different delimiter can be provided, which then serves as the
22/// delimiter for this and subsequent tokens.<br>
23/// The returned token by default will be trimmed according to the current trimable characters.
24/// - #".GetRest:"
25/// Like #".Next", however returns the complete remaining region without searching for
26/// further delimiters (and tokens).<br>
27/// After this method was invoked, #"HasNext()" will return \c false.
28///
29/// After a token was retrieved, it might be modified using the interface of class #"^Substring"
30/// as the tokenizer does not rely on the bounds of the current token when receiving the next.
31///
32/// Objects of this class can be reused by freshly initializing them by using method #"Set".
33/// Furthermore, even the field #".Rest" is allowed to be changed using the interface of
34/// #"%^Substring" if it seems appropriate. The effect is the same as if the method #".Set" was
35/// invoked to apply a different source string.
36///
37/// <b>Sample code</b>:<br>
38/// The following code sample shows how to tokenize a string:
39///
40/// \snippet "DOX_TOKENIZER.cpp" DOX_TOKENIZER
41///
42/// The output will be:
43///
44/// \verbinclude "DOX_TOKENIZER.txt"
45///
46/// @tparam TChar The character type. Implementations for \c nchar and \c wchar are provided
47/// with type definitions #"alib::TokenizerN" and
48/// #"alib::TokenizerW".
49//==================================================================================================
50template<typename TChar>
52 //################################################################################################
53 // Public fields
54 //################################################################################################
55 public:
56 /// A #"^Substring" that represents the part of the underlying data that has not been
57 /// tokenized, yet.
58 /// It is allowed to manipulate this public field, which has a similar effect as
59 /// using method #".Set".<br>
61
62 /// The actual token, which is returned with every invocation of #".Next" or #".GetRest".
63 /// It is allowed to manipulate this field any time.<br>
65
66 /// The whitespace characters used to trim the tokens.
67 /// Defaults to #"alib::DEFAULT_WHITESPACES"
69
70
71 //################################################################################################
72 // Internal fields
73 //################################################################################################
74 protected:
75 /// The most recently set delimiter used by default for the next token extraction.
76 TChar delim;
77
78 /// If \c true, empty tokens are omitted.
80
81
82 //################################################################################################
83 // Constructors/Destructor
84 //################################################################################################
85 public:
86 /// Constructs an empty tokenizer. To initialize, method #".Set" needs to be invoked.
88
89 /// Constructs a tokenizer to work on a given string.
90 ///
91 /// @param src The string to be tokenized.
92 /// @param delimiter The delimiter that separates the tokens. Can be changed with
93 /// every next token.
94 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
95 /// Optional and defaults to \c false.
96 TTokenizer( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
97 : Rest (src)
98 , Actual(nullptr)
99 , TrimChars( CStringConstantsTraits<TChar>::DefaultWhitespaces() )
100 , delim(delimiter)
101 , skipEmpty(skipEmptyTokens) {}
102
103 //################################################################################################
104 // Interface
105 //################################################################################################
106 public:
107 /// Resets a tokenizer to work on a given string.
108 ///
109 /// @param src The string to be tokenized
110 /// @param delimiter The delimiter that separates the tokens. Can be changed with
111 /// every next token.
112 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
113 /// Optional and defaults to \c false.
114 void Set( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false ) {
115 Actual = nullptr;
116 Rest = src;
117 this->delim = delimiter;
118 this->skipEmpty = skipEmptyTokens;
119 }
120
121 /// Returns the next token, which is afterward also available through the field #".Actual".
122 /// If no further token was available, the returned #"^Substring" will be \e nulled.
123 /// (see #"TString::IsNull;String::IsNull").
124 /// To prevent this, the availability of the next token should be checked with the method
125 /// #".HasNext".
126 ///
127 /// For clarification, see the explanation and sample code in this class's documentation.
128 ///
129 /// @param trimming Determines if the token is trimmed in respect to the white space
130 /// characters defined in field #".TrimChars".
131 /// Defaults to #"Whitespaces::Trim;2".
132 /// @param newDelim The delimiter that separates the tokens. Defaults to 0, which keeps the
133 /// current delimiter intact.
134 /// A new delimiter can be provided for every next token.
135 /// @return The next token as #"%^Substring". A nulled string is returned if no next token
136 /// was available.
139 TChar newDelim= '\0' );
140
141 /// Returns the currently remaining string (without searching for further delimiter
142 /// characters).
143 /// After this call #".HasNext" will return \c false and #".Next" will return a \e nulled
144 /// Substring.
145 /// @param trimming Determines if the token is trimmed in respect to the white space
146 /// characters defined in field #".TrimChars".
147 /// Defaults to #"Whitespaces::Trim;2".
148 /// @return The rest of the original source string, which was not returned by #".Next", yet.
150 // set start, end and end of tokenizer
151 Actual= Rest;
152 Rest = nullptr;
153 if ( trimming == lang::Whitespaces::Trim )
154 Actual.Trim( TrimChars );
155 return Actual;
156 }
157
158 /// If this returns \c true, a call to #".Next" will be successful and will return a
159 /// #"%^Substring" which is not \e nulled.
160 /// @return \c true if a next token is available.
161 bool HasNext() { return Rest.IsNotNull() && ( !skipEmpty || Rest.IsNotEmpty() ); }
162
163}; // class Tokenizer
164
165
168
169}} // namespace alib[::strings::util]
170
171/// Type alias in namespace #"%alib".
173
174/// Type alias in namespace #"%alib".
176
177/// Type alias in namespace #"%alib".
179
180
181
182} // namespace [alib]
#define ALIB_DLL
#define ALIB_EXPORT
void Set(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
TTokenizer(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
Definition tokenizer.hpp:96
TSubstring< TChar > & GetRest(lang::Whitespaces trimming=lang::Whitespaces::Trim)
TSubstring< TChar > & Next(lang::Whitespaces trimming=lang::Whitespaces::Trim, TChar newDelim='\0')
Definition tokenizer.cpp:4
TTokenizer()
Constructs an empty tokenizer. To initialize, method #".Set" needs to be invoked.
Definition tokenizer.hpp:87
TLocalString< character, 8 > TrimChars
Definition tokenizer.hpp:68
Whitespaces
Denotes whether a string is trimmed or not.
@ Trim
Trim whitespaces away.
Definition alox.cpp:14
strings::util::TTokenizer< wchar > TokenizerW
Type alias in namespace #"%alib".
strings::util::TTokenizer< character > Tokenizer
Type alias in namespace #"%alib".
characters::wchar wchar
Type alias in namespace #"%alib".
characters::nchar nchar
Type alias in namespace #"%alib".
strings::util::TTokenizer< nchar > TokenizerN
Type alias in namespace #"%alib".