00001
00002
00003
00004
00005
00006
00007
00008
00009
00012 #include "stdafx.h"
00013 #include "debug.h"
00014 #include "core/alloc_func.hpp"
00015 #include "core/math_func.hpp"
00016 #include "string_func.h"
00017 #include "string_base.h"
00018
00019 #include "table/control_codes.h"
00020
00021 #include <stdarg.h>
00022 #include <ctype.h>
00023
00024 #ifdef _MSC_VER
00025 #include <errno.h>
00026 #endif
00027
00028 #ifdef WITH_ICU
00029
00030 #include <unicode/ustring.h>
00031 #include "language.h"
00032 #include "gfx_func.h"
00033 #endif
00034
00045 static int CDECL vseprintf(char *str, const char *last, const char *format, va_list ap)
00046 {
00047 ptrdiff_t diff = last - str;
00048 if (diff < 0) return 0;
00049 return min((int)diff, vsnprintf(str, diff + 1, format, ap));
00050 }
00051
00066 void ttd_strlcat(char *dst, const char *src, size_t size)
00067 {
00068 assert(size > 0);
00069 while (size > 0 && *dst != '\0') {
00070 size--;
00071 dst++;
00072 }
00073
00074 ttd_strlcpy(dst, src, size);
00075 }
00076
00077
/**
 * Copy \a src into \a dst, writing at most \a size bytes including the
 * terminating NUL. The result is always NUL-terminated; a source that
 * does not fit is truncated.
 *
 * @param dst  Destination buffer.
 * @param src  String to copy.
 * @param size Size of the destination buffer in bytes; must be > 0.
 */
void ttd_strlcpy(char *dst, const char *src, size_t size)
{
	assert(size > 0);

	/* One byte of the budget is reserved for the terminator. */
	for (size_t room = size - 1; room > 0 && *src != '\0'; room--) {
		*dst++ = *src++;
	}
	*dst = '\0';
}
00100
00101
00118 char *strecat(char *dst, const char *src, const char *last)
00119 {
00120 assert(dst <= last);
00121 while (*dst != '\0') {
00122 if (dst == last) return dst;
00123 dst++;
00124 }
00125
00126 return strecpy(dst, src, last);
00127 }
00128
00129
00146 char *strecpy(char *dst, const char *src, const char *last)
00147 {
00148 assert(dst <= last);
00149 while (dst != last && *src != '\0') {
00150 *dst++ = *src++;
00151 }
00152 *dst = '\0';
00153
00154 if (dst == last && *src != '\0') {
00155 #if defined(STRGEN) || defined(SETTINGSGEN)
00156 error("String too long for destination buffer");
00157 #else
00158 DEBUG(misc, 0, "String too long for destination buffer");
00159 #endif
00160 }
00161 return dst;
00162 }
00163
00169 char *CDECL str_fmt(const char *str, ...)
00170 {
00171 char buf[4096];
00172 va_list va;
00173
00174 va_start(va, str);
00175 int len = vseprintf(buf, lastof(buf), str, va);
00176 va_end(va);
00177 char *p = MallocT<char>(len + 1);
00178 memcpy(p, buf, len + 1);
00179 return p;
00180 }
00181
00188 void str_fix_scc_encoded(char *str, const char *last)
00189 {
00190 while (str <= last && *str != '\0') {
00191 size_t len = Utf8EncodedCharLen(*str);
00192 if ((len == 0 && str + 4 > last) || str + len > last) break;
00193
00194 WChar c;
00195 len = Utf8Decode(&c, str);
00196 if (c == '\0') break;
00197
00198 if (c == 0xE028 || c == 0xE02A) {
00199 c = SCC_ENCODED;
00200 }
00201 str += Utf8Encode(str, c);
00202 }
00203 *str = '\0';
00204 }
00205
00206
00214 void str_validate(char *str, const char *last, StringValidationSettings settings)
00215 {
00216
00217
00218 char *dst = str;
00219 while (str <= last && *str != '\0') {
00220 size_t len = Utf8EncodedCharLen(*str);
00221
00222
00223
00224
00225
00226 if ((len == 0 && str + 4 > last) || str + len > last) break;
00227
00228 WChar c;
00229 len = Utf8Decode(&c, str);
00230
00231
00232
00233 if (c == '\0') break;
00234
00235 if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
00236
00237
00238
00239 do {
00240 *dst++ = *str++;
00241 } while (--len != 0);
00242 } else if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\n') {
00243 *dst++ = *str++;
00244 } else {
00245 if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
00246 str += len;
00247 continue;
00248 }
00249
00250 str += len;
00251 if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
00252 }
00253 }
00254
00255 *dst = '\0';
00256 }
00257
00263 void ValidateString(const char *str)
00264 {
00265
00266 str_validate(const_cast<char *>(str), str + strlen(str) + 1);
00267 }
00268
00269
00277 bool StrValid(const char *str, const char *last)
00278 {
00279
00280
00281 while (str <= last && *str != '\0') {
00282 size_t len = Utf8EncodedCharLen(*str);
00283
00284
00285
00286
00287 if (len == 0 || str + len > last) return false;
00288
00289 WChar c;
00290 len = Utf8Decode(&c, str);
00291 if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
00292 return false;
00293 }
00294
00295 str += len;
00296 }
00297
00298 return *str == '\0';
00299 }
00300
00302 void str_strip_colours(char *str)
00303 {
00304 char *dst = str;
00305 WChar c;
00306 size_t len;
00307
00308 for (len = Utf8Decode(&c, str); c != '\0'; len = Utf8Decode(&c, str)) {
00309 if (c < SCC_BLUE || c > SCC_BLACK) {
00310
00311
00312
00313 do {
00314 *dst++ = *str++;
00315 } while (--len != 0);
00316 } else {
00317
00318 str += len;
00319 }
00320 }
00321 *dst = '\0';
00322 }
00323
00330 size_t Utf8StringLength(const char *s)
00331 {
00332 size_t len = 0;
00333 const char *t = s;
00334 while (Utf8Consume(&t) != 0) len++;
00335 return len;
00336 }
00337
00338
/**
 * Convert a NUL-terminated string to lower case in place, byte by byte,
 * using tolower().
 *
 * @param str String to convert; modified in place.
 * @return true when at least one byte was changed.
 */
bool strtolower(char *str)
{
	bool changed = false;
	for (; *str != '\0'; str++) {
		/* tolower() is only defined for values representable as unsigned
		 * char (or EOF). A plain char may be negative for bytes >= 0x80,
		 * e.g. inside UTF-8 sequences, so convert before the call. */
		const char lowered = static_cast<char>(tolower(static_cast<unsigned char>(*str)));
		changed |= lowered != *str;
		*str = lowered;
	}
	return changed;
}
00360
00368 bool IsValidChar(WChar key, CharSetFilter afilter)
00369 {
00370 switch (afilter) {
00371 case CS_ALPHANUMERAL: return IsPrintable(key);
00372 case CS_NUMERAL: return (key >= '0' && key <= '9');
00373 case CS_NUMERAL_SPACE: return (key >= '0' && key <= '9') || key == ' ';
00374 case CS_ALPHA: return IsPrintable(key) && !(key >= '0' && key <= '9');
00375 case CS_HEXADECIMAL: return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
00376 }
00377
00378 return false;
00379 }
00380
00381 #ifdef WIN32
00382
00383 #if (__MINGW32_MAJOR_VERSION < 3) || ((__MINGW32_MAJOR_VERSION == 3) && (__MINGW32_MINOR_VERSION < 14))
00384 int CDECL snprintf(char *str, size_t size, const char *format, ...)
00385 {
00386 va_list ap;
00387 int ret;
00388
00389 va_start(ap, format);
00390 ret = vsnprintf(str, size, format, ap);
00391 va_end(ap);
00392 return ret;
00393 }
00394 #endif
00395
00396 #ifdef _MSC_VER
00397
00404 int CDECL vsnprintf(char *str, size_t size, const char *format, va_list ap)
00405 {
00406 if (size == 0) return 0;
00407
00408 errno = 0;
00409 int ret = _vsnprintf(str, size, format, ap);
00410
00411 if (ret < 0) {
00412 if (errno != ERANGE) {
00413
00414
00415 NOT_REACHED();
00416 }
00417 } else if ((size_t)ret < size) {
00418
00419
00420
00421 return ret;
00422 }
00423
00424
00425
00426 str[size - 1] = '\0';
00427 return (int)size;
00428 }
00429 #endif
00430
00431 #endif
00432
00442 int CDECL seprintf(char *str, const char *last, const char *format, ...)
00443 {
00444 va_list ap;
00445
00446 va_start(ap, format);
00447 int ret = vseprintf(str, last, format, ap);
00448 va_end(ap);
00449 return ret;
00450 }
00451
00452
00460 char *md5sumToString(char *buf, const char *last, const uint8 md5sum[16])
00461 {
00462 char *p = buf;
00463
00464 for (uint i = 0; i < 16; i++) {
00465 p += seprintf(p, last, "%02X", md5sum[i]);
00466 }
00467
00468 return p;
00469 }
00470
00471
00472
00473
00474
00481 size_t Utf8Decode(WChar *c, const char *s)
00482 {
00483 assert(c != NULL);
00484
00485 if (!HasBit(s[0], 7)) {
00486
00487 *c = s[0];
00488 return 1;
00489 } else if (GB(s[0], 5, 3) == 6) {
00490 if (IsUtf8Part(s[1])) {
00491
00492 *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
00493 if (*c >= 0x80) return 2;
00494 }
00495 } else if (GB(s[0], 4, 4) == 14) {
00496 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
00497
00498 *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
00499 if (*c >= 0x800) return 3;
00500 }
00501 } else if (GB(s[0], 3, 5) == 30) {
00502 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
00503
00504 *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
00505 if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
00506 }
00507 }
00508
00509
00510 *c = '?';
00511 return 1;
00512 }
00513
00514
00521 size_t Utf8Encode(char *buf, WChar c)
00522 {
00523 if (c < 0x80) {
00524 *buf = c;
00525 return 1;
00526 } else if (c < 0x800) {
00527 *buf++ = 0xC0 + GB(c, 6, 5);
00528 *buf = 0x80 + GB(c, 0, 6);
00529 return 2;
00530 } else if (c < 0x10000) {
00531 *buf++ = 0xE0 + GB(c, 12, 4);
00532 *buf++ = 0x80 + GB(c, 6, 6);
00533 *buf = 0x80 + GB(c, 0, 6);
00534 return 3;
00535 } else if (c < 0x110000) {
00536 *buf++ = 0xF0 + GB(c, 18, 3);
00537 *buf++ = 0x80 + GB(c, 12, 6);
00538 *buf++ = 0x80 + GB(c, 6, 6);
00539 *buf = 0x80 + GB(c, 0, 6);
00540 return 4;
00541 }
00542
00543
00544 *buf = '?';
00545 return 1;
00546 }
00547
00555 size_t Utf8TrimString(char *s, size_t maxlen)
00556 {
00557 size_t length = 0;
00558
00559 for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
00560 size_t len = Utf8EncodedCharLen(*s);
00561
00562 if (len == 0) len = 1;
00563
00564
00565
00566 if (length + len >= maxlen || (s + len > ptr)) break;
00567 s += len;
00568 length += len;
00569 }
00570
00571 *s = '\0';
00572 return length;
00573 }
00574
00575 #ifdef DEFINE_STRNDUP
00576 char *strndup(const char *s, size_t len)
00577 {
00578 len = ttd_strnlen(s, len);
00579 char *tmp = CallocT<char>(len + 1);
00580 memcpy(tmp, s, len);
00581 return tmp;
00582 }
00583 #endif
00584
00585 #ifdef DEFINE_STRCASESTR
00586 char *strcasestr(const char *haystack, const char *needle)
00587 {
00588 size_t hay_len = strlen(haystack);
00589 size_t needle_len = strlen(needle);
00590 while (hay_len >= needle_len) {
00591 if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
00592
00593 haystack++;
00594 hay_len--;
00595 }
00596
00597 return NULL;
00598 }
00599 #endif
00600
00609 static const char *SkipGarbage(const char *str)
00610 {
00611 while (*str != '\0' && (*str < '0' || IsInsideMM(*str, ';', '@' + 1) || IsInsideMM(*str, '[', '`' + 1) || IsInsideMM(*str, '{', '~' + 1))) str++;
00612 return str;
00613 }
00614
00623 int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
00624 {
00625 if (ignore_garbage_at_front) {
00626 s1 = SkipGarbage(s1);
00627 s2 = SkipGarbage(s2);
00628 }
00629 #ifdef WITH_ICU
00630 if (_current_collator != NULL) {
00631 UErrorCode status = U_ZERO_ERROR;
00632 int result;
00633
00634
00635 #if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 2)
00636
00637 result = _current_collator->compareUTF8(s1, s2, status);
00638 #else
00639 UChar buffer1[DRAW_STRING_BUFFER];
00640 u_strFromUTF8Lenient(buffer1, lengthof(buffer1), NULL, s1, -1, &status);
00641 UChar buffer2[DRAW_STRING_BUFFER];
00642 u_strFromUTF8Lenient(buffer2, lengthof(buffer2), NULL, s2, -1, &status);
00643
00644 result = _current_collator->compare(buffer1, buffer2, status);
00645 #endif
00646 if (U_SUCCESS(status)) return result;
00647 }
00648
00649 #endif
00650
00651
00652 return strcasecmp(s1, s2);
00653 }
00654
00655 #ifdef WITH_ICU
00656
00657 #include <unicode/utext.h>
00658 #include <unicode/brkiter.h>
00659
/**
 * String iterator that uses ICU break iterators to find character and
 * word boundaries. The iterated text is kept as UTF-16 together with a
 * table mapping every UTF-16 index back to a byte offset in the original
 * UTF-8 string.
 */
class IcuStringIterator : public StringIterator
{
	icu::BreakIterator *char_itr; ///< ICU iterator for characters.
	icu::BreakIterator *word_itr; ///< ICU iterator for words.
	const char *string;           ///< Iteration string in UTF-8; set by SetString().

	SmallVector<UChar, 32> utf16_str;      ///< UTF-16 copy of the string.
	SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code unit index to UTF-8 byte offset.

public:
	/**
	 * Create the break iterators for the current language, or "en" when
	 * no language is set, and start out with an empty string.
	 * NOTE(review): the ICU status is not checked after creating the iterators.
	 */
	IcuStringIterator() : char_itr(NULL), word_itr(NULL)
	{
		UErrorCode status = U_ZERO_ERROR;
		this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
		this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);

		/* An empty string: just the terminator, mapping to offset 0. */
		*this->utf16_str.Append() = '\0';
		*this->utf16_to_utf8.Append() = 0;
	}

	/** Release both break iterators. */
	virtual ~IcuStringIterator()
	{
		delete this->char_itr;
		delete this->word_itr;
	}

	/**
	 * Set a new iteration string and reset both iterators to its start.
	 * @param s NUL-terminated UTF-8 string; the pointer is stored.
	 */
	virtual void SetString(const char *s)
	{
		this->string = s;

		/* Convert the string to UTF-16 by hand, recording for every UTF-16
		 * code unit the byte offset of the character it came from. */
		this->utf16_str.Clear();
		this->utf16_to_utf8.Clear();

		while (*s != '\0') {
			size_t idx = s - this->string;

			WChar c = Utf8Consume(&s);
			if (c < 0x10000) {
				*this->utf16_str.Append() = (UChar)c;
			} else {
				/* Characters of 0x10000 and above become a surrogate pair;
				 * both halves map to the same UTF-8 offset. */
				*this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
				*this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
				*this->utf16_to_utf8.Append() = idx;
			}
			*this->utf16_to_utf8.Append() = idx;
		}
		/* Terminator; its mapping is the length of the UTF-8 string. */
		*this->utf16_str.Append() = '\0';
		*this->utf16_to_utf8.Append() = s - this->string;

		/* Hand the UTF-16 text, without its terminator, to both iterators. */
		UText text = UTEXT_INITIALIZER;
		UErrorCode status = U_ZERO_ERROR;
		utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status);
		this->char_itr->setText(&text, status);
		this->word_itr->setText(&text, status);
		this->char_itr->first();
		this->word_itr->first();
	}

	/**
	 * Move the character iterator to the given UTF-8 byte offset.
	 * @param pos Byte offset into the string.
	 * @return Byte offset of the character iterator's position afterwards.
	 */
	virtual size_t SetCurPosition(size_t pos)
	{
		/* Find the first UTF-16 index mapping to the requested offset;
		 * index 0 is used when there is none. */
		uint utf16_pos = 0;
		for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
			if (this->utf16_to_utf8[i] == pos) {
				utf16_pos = i;
				break;
			}
		}

		/* The result of isBoundary() is ignored; the call is made for its
		 * effect on the iterator position (NOTE(review): per the ICU
		 * documentation it moves to the first boundary at or following
		 * the offset). */
		this->char_itr->isBoundary(utf16_pos);
		return this->utf16_to_utf8[this->char_itr->current()];
	}

	/**
	 * Advance by one character or word.
	 * @param what Unit to advance by.
	 * @return Byte offset of the new position, or END when ICU reports DONE.
	 */
	virtual size_t Next(IterType what)
	{
		int32_t pos;
		switch (what) {
			case ITER_CHARACTER:
				pos = this->char_itr->next();
				break;

			case ITER_WORD:
				pos = this->word_itr->following(this->char_itr->current());
				/* Keep advancing while the boundary found is at whitespace. */
				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();

				/* Bring the character iterator to the same position. */
				this->char_itr->isBoundary(pos);
				break;

			default:
				NOT_REACHED();
		}

		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
	}

	/**
	 * Go back by one character or word.
	 * @param what Unit to go back by.
	 * @return Byte offset of the new position, or END when ICU reports DONE.
	 */
	virtual size_t Prev(IterType what)
	{
		int32_t pos;
		switch (what) {
			case ITER_CHARACTER:
				pos = this->char_itr->previous();
				break;

			case ITER_WORD:
				pos = this->word_itr->preceding(this->char_itr->current());
				/* Keep going back while the boundary found is at whitespace. */
				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();

				/* Bring the character iterator to the same position. */
				this->char_itr->isBoundary(pos);
				break;

			default:
				NOT_REACHED();
		}

		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
	}
};
00792
00793 StringIterator *StringIterator::Create()
00794 {
00795 return new IcuStringIterator();
00796 }
00797
00798 #else
00799
00801 class DefaultStringIterator : public StringIterator
00802 {
00803 const char *string;
00804 size_t len;
00805 size_t cur_pos;
00806
00807 public:
00808 DefaultStringIterator() : string(NULL), len(0), cur_pos(0)
00809 {
00810 }
00811
00812 virtual void SetString(const char *s)
00813 {
00814 this->string = s;
00815 this->len = strlen(s);
00816 this->cur_pos = 0;
00817 }
00818
00819 virtual size_t SetCurPosition(size_t pos)
00820 {
00821 assert(this->string != NULL && pos <= this->len);
00822
00823 while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
00824 return this->cur_pos = pos;
00825 }
00826
00827 virtual size_t Next(IterType what)
00828 {
00829 assert(this->string != NULL);
00830
00831
00832 if (this->cur_pos >= this->len) return END;
00833
00834 switch (what) {
00835 case ITER_CHARACTER: {
00836 WChar c;
00837 this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
00838 return this->cur_pos;
00839 }
00840
00841 case ITER_WORD: {
00842 WChar c;
00843
00844 size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
00845 while (this->cur_pos < this->len && !IsWhitespace(c)) {
00846 this->cur_pos += offs;
00847 offs = Utf8Decode(&c, this->string + this->cur_pos);
00848 }
00849
00850 while (this->cur_pos < this->len && IsWhitespace(c)) {
00851 this->cur_pos += offs;
00852 offs = Utf8Decode(&c, this->string + this->cur_pos);
00853 }
00854
00855 return this->cur_pos;
00856 }
00857
00858 default:
00859 NOT_REACHED();
00860 }
00861
00862 return END;
00863 }
00864
00865 virtual size_t Prev(IterType what)
00866 {
00867 assert(this->string != NULL);
00868
00869
00870 if (this->cur_pos == 0) return END;
00871
00872 switch (what) {
00873 case ITER_CHARACTER:
00874 return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
00875
00876 case ITER_WORD: {
00877 const char *s = this->string + this->cur_pos;
00878 WChar c;
00879
00880 do {
00881 s = Utf8PrevChar(s);
00882 Utf8Decode(&c, s);
00883 } while (s > this->string && IsWhitespace(c));
00884
00885 while (s > this->string && !IsWhitespace(c)) {
00886 s = Utf8PrevChar(s);
00887 Utf8Decode(&c, s);
00888 }
00889
00890 if (IsWhitespace(c)) Utf8Consume(&s);
00891
00892 return this->cur_pos = s - this->string;
00893 }
00894
00895 default:
00896 NOT_REACHED();
00897 }
00898
00899 return END;
00900 }
00901 };
00902
00903 StringIterator *StringIterator::Create()
00904 {
00905 return new DefaultStringIterator();
00906 }
00907
00908 #endif