string.cpp

Go to the documentation of this file.
00001 /* $Id: string.cpp 26124 2013-11-26 13:42:09Z rubidium $ */
00002 
00003 /*
00004  * This file is part of OpenTTD.
00005  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
00006  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00007  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
00008  */
00009 
00012 #include "stdafx.h"
00013 #include "debug.h"
00014 #include "core/alloc_func.hpp"
00015 #include "core/math_func.hpp"
00016 #include "string_func.h"
00017 #include "string_base.h"
00018 
00019 #include "table/control_codes.h"
00020 
00021 #include <stdarg.h>
00022 #include <ctype.h> /* required for tolower() */
00023 
00024 #ifdef _MSC_VER
00025 #include <errno.h> // required by vsnprintf implementation for MSVC
00026 #endif
00027 
00028 #ifdef WITH_ICU
00029 /* Required by strnatcmp. */
00030 #include <unicode/ustring.h>
00031 #include "language.h"
00032 #include "gfx_func.h"
00033 #endif /* WITH_ICU */
00034 
00045 static int CDECL vseprintf(char *str, const char *last, const char *format, va_list ap)
00046 {
00047   ptrdiff_t diff = last - str;
00048   if (diff < 0) return 0;
00049   return min((int)diff, vsnprintf(str, diff + 1, format, ap));
00050 }
00051 
00066 void ttd_strlcat(char *dst, const char *src, size_t size)
00067 {
00068   assert(size > 0);
00069   while (size > 0 && *dst != '\0') {
00070     size--;
00071     dst++;
00072   }
00073 
00074   ttd_strlcpy(dst, src, size);
00075 }
00076 
00077 
/**
 * Copy src into dst, truncating if needed, and always terminate dst.
 * @param dst  Destination buffer.
 * @param src  String to copy.
 * @param size Number of bytes that may be written to dst, terminator included;
 *             must be larger than zero (asserted).
 */
void ttd_strlcpy(char *dst, const char *src, size_t size)
{
  assert(size > 0);

  /* One byte is always kept back for the terminator. */
  size_t room = size - 1;
  for (; room > 0 && *src != '\0'; room--) {
    *dst = *src;
    dst++;
    src++;
  }
  *dst = '\0';
}
00100 
00101 
00118 char *strecat(char *dst, const char *src, const char *last)
00119 {
00120   assert(dst <= last);
00121   while (*dst != '\0') {
00122     if (dst == last) return dst;
00123     dst++;
00124   }
00125 
00126   return strecpy(dst, src, last);
00127 }
00128 
00129 
00146 char *strecpy(char *dst, const char *src, const char *last)
00147 {
00148   assert(dst <= last);
00149   while (dst != last && *src != '\0') {
00150     *dst++ = *src++;
00151   }
00152   *dst = '\0';
00153 
00154   if (dst == last && *src != '\0') {
00155 #if defined(STRGEN) || defined(SETTINGSGEN)
00156     error("String too long for destination buffer");
00157 #else /* STRGEN || SETTINGSGEN */
00158     DEBUG(misc, 0, "String too long for destination buffer");
00159 #endif /* STRGEN || SETTINGSGEN */
00160   }
00161   return dst;
00162 }
00163 
00169 char *CDECL str_fmt(const char *str, ...)
00170 {
00171   char buf[4096];
00172   va_list va;
00173 
00174   va_start(va, str);
00175   int len = vseprintf(buf, lastof(buf), str, va);
00176   va_end(va);
00177   char *p = MallocT<char>(len + 1);
00178   memcpy(p, buf, len + 1);
00179   return p;
00180 }
00181 
00188 void str_fix_scc_encoded(char *str, const char *last)
00189 {
00190   while (str <= last && *str != '\0') {
00191     size_t len = Utf8EncodedCharLen(*str);
00192     if ((len == 0 && str + 4 > last) || str + len > last) break;
00193 
00194     WChar c;
00195     len = Utf8Decode(&c, str);
00196     if (c == '\0') break;
00197 
00198     if (c == 0xE028 || c == 0xE02A) {
00199       c = SCC_ENCODED;
00200     }
00201     str += Utf8Encode(str, c);
00202   }
00203   *str = '\0';
00204 }
00205 
00206 
/**
 * Clean up a string in place so that only acceptable characters remain.
 *
 * Kept are: characters for which IsPrintable() holds and that lie outside
 * SCC_SPRITE_START..SCC_SPRITE_END, SCC_ENCODED when SVS_ALLOW_CONTROL_CODE
 * is set, and '\n' when SVS_ALLOW_NEWLINE is set. With SVS_ALLOW_NEWLINE a
 * '\r' directly followed by '\n' is dropped silently. Everything else is
 * removed, or replaced by '?' when SVS_REPLACE_WITH_QUESTION_MARK is set.
 *
 * @param str      String to validate; modified in place and always terminated.
 * @param last     Last valid byte of the buffer; scanning never starts a
 *                 character that would extend beyond it.
 * @param settings Bit set of SVS_* flags selecting the behaviour above.
 */
00214 void str_validate(char *str, const char *last, StringValidationSettings settings)
00215 {
00216   /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
00217 
        /* dst is the write cursor; it never gets ahead of the read cursor str. */
00218   char *dst = str;
00219   while (str <= last && *str != '\0') {
00220     size_t len = Utf8EncodedCharLen(*str);
00221     /* If the character is unknown, i.e. encoded length is 0
00222      * we assume worst case for the length check.
00223      * The length check is needed to prevent Utf8Decode to read
00224      * over the terminating '\0' if that happens to be placed
00225      * within the encoding of an UTF8 character. */
00226     if ((len == 0 && str + 4 > last) || str + len > last) break;
00227 
00228     WChar c;
00229     len = Utf8Decode(&c, str);
00230     /* It's possible to encode the string termination character
00231      * into a multiple bytes. This prevents those termination
00232      * characters to be skipped */
00233     if (c == '\0') break;
00234 
00235     if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
00236       /* Copy the character back. Even if dst is current the same as str
00237        * (i.e. no characters have been changed) this is quicker than
00238        * moving the pointers ahead by len */
00239       do {
00240         *dst++ = *str++;
00241       } while (--len != 0);
00242     } else if ((settings & SVS_ALLOW_NEWLINE) != 0  && c == '\n') {
00243       *dst++ = *str++;
00244     } else {
            /* A carriage return that is part of "\r\n" vanishes without leaving a '?'. */
00245       if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
00246         str += len;
00247         continue;
00248       }
00249       /* Replace the undesirable character with a question mark */
00250       str += len;
00251       if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
00252     }
00253   }
00254 
        /* Terminate at the write cursor; anything after it is dropped. */
00255   *dst = '\0';
00256 }
00257 
00263 void ValidateString(const char *str)
00264 {
00265   /* We know it is '\0' terminated. */
00266   str_validate(const_cast<char *>(str), str + strlen(str) + 1);
00267 }
00268 
00269 
00277 bool StrValid(const char *str, const char *last)
00278 {
00279   /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
00280 
00281   while (str <= last && *str != '\0') {
00282     size_t len = Utf8EncodedCharLen(*str);
00283     /* Encoded length is 0 if the character isn't known.
00284      * The length check is needed to prevent Utf8Decode to read
00285      * over the terminating '\0' if that happens to be placed
00286      * within the encoding of an UTF8 character. */
00287     if (len == 0 || str + len > last) return false;
00288 
00289     WChar c;
00290     len = Utf8Decode(&c, str);
00291     if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
00292       return false;
00293     }
00294 
00295     str += len;
00296   }
00297 
00298   return *str == '\0';
00299 }
00300 
00302 void str_strip_colours(char *str)
00303 {
00304   char *dst = str;
00305   WChar c;
00306   size_t len;
00307 
00308   for (len = Utf8Decode(&c, str); c != '\0'; len = Utf8Decode(&c, str)) {
00309     if (c < SCC_BLUE || c > SCC_BLACK) {
00310       /* Copy the character back. Even if dst is current the same as str
00311        * (i.e. no characters have been changed) this is quicker than
00312        * moving the pointers ahead by len */
00313       do {
00314         *dst++ = *str++;
00315       } while (--len != 0);
00316     } else {
00317       /* Just skip (strip) the colour codes */
00318       str += len;
00319     }
00320   }
00321   *dst = '\0';
00322 }
00323 
00330 size_t Utf8StringLength(const char *s)
00331 {
00332   size_t len = 0;
00333   const char *t = s;
00334   while (Utf8Consume(&t) != 0) len++;
00335   return len;
00336 }
00337 
00338 
/**
 * Convert a string to lower case in place, byte by byte, using tolower().
 * @param str The string to convert.
 * @return true when at least one byte of the string was changed.
 */
bool strtolower(char *str)
{
  bool changed = false;
  for (; *str != '\0'; str++) {
    /* The <ctype.h> functions need a value representable as unsigned char
     * (or EOF). Plain char may be signed, so bytes >= 0x80 would otherwise
     * be passed as negative values, which is undefined behaviour. */
    const char lowered = (char)tolower((unsigned char)*str);
    if (lowered != *str) {
      changed = true;
      *str = lowered;
    }
  }
  return changed;
}
00360 
00368 bool IsValidChar(WChar key, CharSetFilter afilter)
00369 {
00370   switch (afilter) {
00371     case CS_ALPHANUMERAL:  return IsPrintable(key);
00372     case CS_NUMERAL:       return (key >= '0' && key <= '9');
00373     case CS_NUMERAL_SPACE: return (key >= '0' && key <= '9') || key == ' ';
00374     case CS_ALPHA:         return IsPrintable(key) && !(key >= '0' && key <= '9');
00375     case CS_HEXADECIMAL:   return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
00376   }
00377 
00378   return false;
00379 }
00380 
00381 #ifdef WIN32
00382 /* Since version 3.14, MinGW Runtime has snprintf() and vsnprintf() conform to C99 but it's not the case for older versions */
00383 #if (__MINGW32_MAJOR_VERSION < 3) || ((__MINGW32_MAJOR_VERSION == 3) && (__MINGW32_MINOR_VERSION < 14))
00384 int CDECL snprintf(char *str, size_t size, const char *format, ...)
00385 {
00386   va_list ap;
00387   int ret;
00388 
00389   va_start(ap, format);
00390   ret = vsnprintf(str, size, format, ap);
00391   va_end(ap);
00392   return ret;
00393 }
00394 #endif /* MinGW Runtime < 3.14 */
00395 
00396 #ifdef _MSC_VER
00397 
00404 int CDECL vsnprintf(char *str, size_t size, const char *format, va_list ap)
00405 {
00406   if (size == 0) return 0;
00407 
00408   errno = 0;
00409   int ret = _vsnprintf(str, size, format, ap);
00410 
00411   if (ret < 0) {
00412     if (errno != ERANGE) {
00413       /* There's a formatting error, better get that looked
00414        * at properly instead of ignoring it. */
00415       NOT_REACHED();
00416     }
00417   } else if ((size_t)ret < size) {
00418     /* The buffer is big enough for the number of
00419      * characters stored (excluding null), i.e.
00420      * the string has been null-terminated. */
00421     return ret;
00422   }
00423 
00424   /* The buffer is too small for _vsnprintf to write the
00425    * null-terminator at its end and return size. */
00426   str[size - 1] = '\0';
00427   return (int)size;
00428 }
00429 #endif /* _MSC_VER */
00430 
00431 #endif /* WIN32 */
00432 
00442 int CDECL seprintf(char *str, const char *last, const char *format, ...)
00443 {
00444   va_list ap;
00445 
00446   va_start(ap, format);
00447   int ret = vseprintf(str, last, format, ap);
00448   va_end(ap);
00449   return ret;
00450 }
00451 
00452 
00460 char *md5sumToString(char *buf, const char *last, const uint8 md5sum[16])
00461 {
00462   char *p = buf;
00463 
00464   for (uint i = 0; i < 16; i++) {
00465     p += seprintf(p, last, "%02X", md5sum[i]);
00466   }
00467 
00468   return p;
00469 }
00470 
00471 
00472 /* UTF-8 handling routines */
00473 
00474 
00481 size_t Utf8Decode(WChar *c, const char *s)
00482 {
00483   assert(c != NULL);
00484 
00485   if (!HasBit(s[0], 7)) {
00486     /* Single byte character: 0xxxxxxx */
00487     *c = s[0];
00488     return 1;
00489   } else if (GB(s[0], 5, 3) == 6) {
00490     if (IsUtf8Part(s[1])) {
00491       /* Double byte character: 110xxxxx 10xxxxxx */
00492       *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
00493       if (*c >= 0x80) return 2;
00494     }
00495   } else if (GB(s[0], 4, 4) == 14) {
00496     if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
00497       /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
00498       *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
00499       if (*c >= 0x800) return 3;
00500     }
00501   } else if (GB(s[0], 3, 5) == 30) {
00502     if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
00503       /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
00504       *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
00505       if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
00506     }
00507   }
00508 
00509   /* DEBUG(misc, 1, "[utf8] invalid UTF-8 sequence"); */
00510   *c = '?';
00511   return 1;
00512 }
00513 
00514 
00521 size_t Utf8Encode(char *buf, WChar c)
00522 {
00523   if (c < 0x80) {
00524     *buf = c;
00525     return 1;
00526   } else if (c < 0x800) {
00527     *buf++ = 0xC0 + GB(c,  6, 5);
00528     *buf   = 0x80 + GB(c,  0, 6);
00529     return 2;
00530   } else if (c < 0x10000) {
00531     *buf++ = 0xE0 + GB(c, 12, 4);
00532     *buf++ = 0x80 + GB(c,  6, 6);
00533     *buf   = 0x80 + GB(c,  0, 6);
00534     return 3;
00535   } else if (c < 0x110000) {
00536     *buf++ = 0xF0 + GB(c, 18, 3);
00537     *buf++ = 0x80 + GB(c, 12, 6);
00538     *buf++ = 0x80 + GB(c,  6, 6);
00539     *buf   = 0x80 + GB(c,  0, 6);
00540     return 4;
00541   }
00542 
00543   /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
00544   *buf = '?';
00545   return 1;
00546 }
00547 
00555 size_t Utf8TrimString(char *s, size_t maxlen)
00556 {
00557   size_t length = 0;
00558 
00559   for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
00560     size_t len = Utf8EncodedCharLen(*s);
00561     /* Silently ignore invalid UTF8 sequences, our only concern trimming */
00562     if (len == 0) len = 1;
00563 
00564     /* Take care when a hard cutoff was made for the string and
00565      * the last UTF8 sequence is invalid */
00566     if (length + len >= maxlen || (s + len > ptr)) break;
00567     s += len;
00568     length += len;
00569   }
00570 
00571   *s = '\0';
00572   return length;
00573 }
00574 
00575 #ifdef DEFINE_STRNDUP
00576 char *strndup(const char *s, size_t len)
00577 {
00578   len = ttd_strnlen(s, len);
00579   char *tmp = CallocT<char>(len + 1);
00580   memcpy(tmp, s, len);
00581   return tmp;
00582 }
00583 #endif /* DEFINE_STRNDUP */
00584 
00585 #ifdef DEFINE_STRCASESTR
00586 char *strcasestr(const char *haystack, const char *needle)
00587 {
00588   size_t hay_len = strlen(haystack);
00589   size_t needle_len = strlen(needle);
00590   while (hay_len >= needle_len) {
00591     if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
00592 
00593     haystack++;
00594     hay_len--;
00595   }
00596 
00597   return NULL;
00598 }
00599 #endif /* DEFINE_STRCASESTR */
00600 
00609 static const char *SkipGarbage(const char *str)
00610 {
00611   while (*str != '\0' && (*str < '0' || IsInsideMM(*str, ';', '@' + 1) || IsInsideMM(*str, '[', '`' + 1) || IsInsideMM(*str, '{', '~' + 1))) str++;
00612   return str;
00613 }
00614 
/**
 * Compare two strings for sorting purposes.
 * When built WITH_ICU and a collator (_current_collator) is available, the
 * collator does the comparison; if that is not the case, or the collator
 * reports a failure status, strcasecmp is used instead.
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 * @param ignore_garbage_at_front When set, both strings first have their
 *        leading characters removed by SkipGarbage.
 * @return The collator's comparison result or the strcasecmp result
 *         (presumably negative/zero/positive for less/equal/greater -- the
 *         collator's value range is not visible here).
 */
00623 int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
00624 {
00625   if (ignore_garbage_at_front) {
00626     s1 = SkipGarbage(s1);
00627     s2 = SkipGarbage(s2);
00628   }
00629 #ifdef WITH_ICU
00630   if (_current_collator != NULL) {
00631     UErrorCode status = U_ZERO_ERROR;
00632     int result;
00633 
00634     /* We want to use the new faster method for ICU 4.2 and higher. */
00635 #if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 2)
00636     /* The StringPiece parameter gets implicitly constructed from the char *. */
00637     result = _current_collator->compareUTF8(s1, s2, status);
00638 #else /* The following for 4.0 and lower. */
          /* Convert both strings into fixed-size UTF-16 buffers and compare those. */
00639     UChar buffer1[DRAW_STRING_BUFFER];
00640     u_strFromUTF8Lenient(buffer1, lengthof(buffer1), NULL, s1, -1, &status);
00641     UChar buffer2[DRAW_STRING_BUFFER];
00642     u_strFromUTF8Lenient(buffer2, lengthof(buffer2), NULL, s2, -1, &status);
00643 
00644     result = _current_collator->compare(buffer1, buffer2, status);
00645 #endif /* ICU version check. */
          /* Only trust the result when ICU reported no error; otherwise fall through. */
00646     if (U_SUCCESS(status)) return result;
00647   }
00648 
00649 #endif /* WITH_ICU */
00650 
00651   /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
00652   return strcasecmp(s1, s2);
00653 }
00654 
00655 #ifdef WITH_ICU
00656 
00657 #include <unicode/utext.h>
00658 #include <unicode/brkiter.h>
00659 
/**
 * String iterator that uses ICU break iterators to find character and word
 * boundaries. The UTF-8 input is converted to UTF-16 for ICU and a table maps
 * the UTF-16 indices back to byte offsets in the UTF-8 string.
 */
00661 class IcuStringIterator : public StringIterator
00662 {
00663   icu::BreakIterator *char_itr; ///< ICU iterator created with createCharacterInstance.
00664   icu::BreakIterator *word_itr; ///< ICU iterator created with createWordInstance.
00665   const char *string;           ///< The UTF-8 string passed to SetString; the pointer is stored, not a copy.
00666 
00667   SmallVector<UChar, 32> utf16_str;      ///< UTF-16 conversion of the string, with a trailing '\0'.
00668   SmallVector<size_t, 32> utf16_to_utf8; ///< Per UTF-16 index: byte offset of the originating character in the UTF-8 string.
00669 
00670 public:
        /**
         * Create both break iterators for the current language's ISO code, or
         * "en" when no language is set, and start out with an empty string.
         */
00671   IcuStringIterator() : char_itr(NULL), word_itr(NULL)
00672   {
00673     UErrorCode status = U_ZERO_ERROR;
00674     this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
00675     this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
00676 
00677     *this->utf16_str.Append() = '\0';
00678     *this->utf16_to_utf8.Append() = 0;
00679   }
00680 
        /** Release both break iterators. */
00681   virtual ~IcuStringIterator()
00682   {
00683     delete this->char_itr;
00684     delete this->word_itr;
00685   }
00686 
        /**
         * Set the string to iterate over and rewind both iterators to its start.
         * @param s The new '\0' terminated UTF-8 string.
         */
00687   virtual void SetString(const char *s)
00688   {
00689     this->string = s;
00690 
00691     /* Unfortunately current ICU versions only provide rudimentary support
00692      * for word break iterators (especially for CJK languages) in combination
00693      * with UTF-8 input. As a work around we have to convert the input to
00694      * UTF-16 and create a mapping back to UTF-8 character indices. */
00695     this->utf16_str.Clear();
00696     this->utf16_to_utf8.Clear();
00697 
00698     while (*s != '\0') {
00699       size_t idx = s - this->string;
00700 
00701       WChar c = Utf8Consume(&s);
00702       if (c < 0x10000) {
00703         *this->utf16_str.Append() = (UChar)c;
00704       } else {
00705         /* Make a surrogate pair. */
00706         *this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
00707         *this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
              /* Two UTF-16 units were added, so the offset is recorded twice. */
00708         *this->utf16_to_utf8.Append() = idx;
00709       }
00710       *this->utf16_to_utf8.Append() = idx;
00711     }
          /* The terminator gets an entry too; it maps to the end of the UTF-8 string. */
00712     *this->utf16_str.Append() = '\0';
00713     *this->utf16_to_utf8.Append() = s - this->string;
00714 
00715     UText text = UTEXT_INITIALIZER;
00716     UErrorCode status = U_ZERO_ERROR;
          /* The length passed on excludes the trailing '\0'. */
00717     utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status);
00718     this->char_itr->setText(&text, status);
00719     this->word_itr->setText(&text, status);
00720     this->char_itr->first();
00721     this->word_itr->first();
00722   }
00723 
        /**
         * Move the iterator to a position given as byte offset in the UTF-8 string.
         * @param pos Byte offset; when no table entry equals it, UTF-16 index 0 is used.
         * @return Byte offset of the character iterator's position after the move.
         */
00724   virtual size_t SetCurPosition(size_t pos)
00725   {
00726     /* Convert incoming position to an UTF-16 string index. */
00727     uint utf16_pos = 0;
00728     for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
00729       if (this->utf16_to_utf8[i] == pos) {
00730         utf16_pos = i;
00731         break;
00732       }
00733     }
00734 
00735     /* isBoundary has the documented side-effect of setting the current
00736      * position to the first valid boundary equal to or greater than
00737      * the passed value. */
00738     this->char_itr->isBoundary(utf16_pos);
00739     return this->utf16_to_utf8[this->char_itr->current()];
00740   }
00741 
        /**
         * Advance to the next character or to the start of the next word.
         * @param what Whether to move by character (ITER_CHARACTER) or by word (ITER_WORD).
         * @return Byte offset of the new position in the UTF-8 string, or END
         *         when the ICU iterator reports DONE.
         */
00742   virtual size_t Next(IterType what)
00743   {
00744     int32_t pos;
00745     switch (what) {
00746       case ITER_CHARACTER:
00747         pos = this->char_itr->next();
00748         break;
00749 
00750       case ITER_WORD:
00751         pos = this->word_itr->following(this->char_itr->current());
00752         /* The ICU word iterator considers both the start and the end of a word a valid
00753          * break point, but we only want word starts. Move to the next location in
00754          * case the new position points to whitespace. */
00755         while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
00756 
              /* Bring the character iterator to the position the word iterator found. */
00757         this->char_itr->isBoundary(pos);
00758         break;
00759 
00760       default:
00761         NOT_REACHED();
00762     }
00763 
00764     return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
00765   }
00766 
        /**
         * Move back to the previous character or to the start of the previous word.
         * @param what Whether to move by character (ITER_CHARACTER) or by word (ITER_WORD).
         * @return Byte offset of the new position in the UTF-8 string, or END
         *         when the ICU iterator reports DONE.
         */
00767   virtual size_t Prev(IterType what)
00768   {
00769     int32_t pos;
00770     switch (what) {
00771       case ITER_CHARACTER:
00772         pos = this->char_itr->previous();
00773         break;
00774 
00775       case ITER_WORD:
00776         pos = this->word_itr->preceding(this->char_itr->current());
00777         /* The ICU word iterator considers both the start and the end of a word a valid
00778          * break point, but we only want word starts. Move to the previous location in
00779          * case the new position points to whitespace. */
00780         while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
00781 
              /* Bring the character iterator to the position the word iterator found. */
00782         this->char_itr->isBoundary(pos);
00783         break;
00784 
00785       default:
00786         NOT_REACHED();
00787     }
00788 
00789     return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
00790   }
00791 };
00792 
00793 /* static */ StringIterator *StringIterator::Create()
00794 {
00795   return new IcuStringIterator();
00796 }
00797 
00798 #else
00799 
/**
 * String iterator for builds without ICU; it walks the UTF-8 string directly
 * and uses IsWhitespace to tell words apart.
 */
00801 class DefaultStringIterator : public StringIterator
00802 {
00803   const char *string; ///< The string passed to SetString; the pointer is stored, not a copy.
00804   size_t len;         ///< Length of the string in bytes, as returned by strlen.
00805   size_t cur_pos;     ///< Current position as byte offset into the string.
00806 
00807 public:
00808   DefaultStringIterator() : string(NULL), len(0), cur_pos(0)
00809   {
00810   }
00811 
        /**
         * Set the string to iterate over and reset the position to its start.
         * @param s The new '\0' terminated string.
         */
00812   virtual void SetString(const char *s)
00813   {
00814     this->string = s;
00815     this->len = strlen(s);
00816     this->cur_pos = 0;
00817   }
00818 
        /**
         * Set the current position.
         * @param pos Byte offset into the string; must not exceed its length (asserted).
         * @return The position actually set, which is moved back to the start of
         *         the character when pos points at a continuation byte.
         */
00819   virtual size_t SetCurPosition(size_t pos)
00820   {
00821     assert(this->string != NULL && pos <= this->len);
00822     /* Sanitize in case we get a position inside an UTF-8 sequence. */
00823     while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
00824     return this->cur_pos = pos;
00825   }
00826 
        /**
         * Advance by one character or to the start of the next word.
         * @param what Whether to move by character (ITER_CHARACTER) or by word (ITER_WORD).
         * @return The new byte offset, or END when already at the end of the string.
         */
00827   virtual size_t Next(IterType what)
00828   {
00829     assert(this->string != NULL);
00830 
00831     /* Already at the end? */
00832     if (this->cur_pos >= this->len) return END;
00833 
00834     switch (what) {
00835       case ITER_CHARACTER: {
00836         WChar c;
00837         this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
00838         return this->cur_pos;
00839       }
00840 
00841       case ITER_WORD: {
00842         WChar c;
00843         /* Consume current word. */
00844         size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
00845         while (this->cur_pos < this->len && !IsWhitespace(c)) {
00846           this->cur_pos += offs;
00847           offs = Utf8Decode(&c, this->string + this->cur_pos);
00848         }
00849         /* Consume whitespace to the next word. */
00850         while (this->cur_pos < this->len && IsWhitespace(c)) {
00851           this->cur_pos += offs;
00852           offs = Utf8Decode(&c, this->string + this->cur_pos);
00853         }
00854 
00855         return this->cur_pos;
00856       }
00857 
00858       default:
00859         NOT_REACHED();
00860     }
00861 
00862     return END;
00863   }
00864 
        /**
         * Move back by one character or to the start of the preceding word.
         * @param what Whether to move by character (ITER_CHARACTER) or by word (ITER_WORD).
         * @return The new byte offset, or END when already at the start of the string.
         */
00865   virtual size_t Prev(IterType what)
00866   {
00867     assert(this->string != NULL);
00868 
00869     /* Already at the beginning? */
00870     if (this->cur_pos == 0) return END;
00871 
00872     switch (what) {
00873       case ITER_CHARACTER:
00874         return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
00875 
00876       case ITER_WORD: {
00877         const char *s = this->string + this->cur_pos;
00878         WChar c;
00879         /* Consume preceding whitespace. */
00880         do {
00881           s = Utf8PrevChar(s);
00882           Utf8Decode(&c, s);
00883         } while (s > this->string && IsWhitespace(c));
00884         /* Consume preceding word. */
00885         while (s > this->string && !IsWhitespace(c)) {
00886           s = Utf8PrevChar(s);
00887           Utf8Decode(&c, s);
00888         }
00889         /* Move caret back to the beginning of the word. */
00890         if (IsWhitespace(c)) Utf8Consume(&s);
00891 
00892         return this->cur_pos = s - this->string;
00893       }
00894 
00895       default:
00896         NOT_REACHED();
00897     }
00898 
00899     return END;
00900   }
00901 };
00902 
00903 /* static */ StringIterator *StringIterator::Create()
00904 {
00905   return new DefaultStringIterator();
00906 }
00907 
00908 #endif