From 76367f6bf1b5b459c9a15faa0cc0ea1dab191c6f Mon Sep 17 00:00:00 2001 From: michi_cc Date: Mon, 5 Aug 2013 20:35:31 +0000 Subject: [PATCH] (svn r25653) -Add: Caret movement by words for CJK languages. --- src/string.cpp | 167 ++++++++++++++++++++++++++++++++++++++++----- src/string_base.h | 10 ++- src/string_func.h | 46 ++++++++++++- src/textbuf.cpp | 116 +++++-------------------------- src/textbuf_type.h | 4 -- 5 files changed, 220 insertions(+), 123 deletions(-) diff --git a/src/string.cpp b/src/string.cpp index bb1f2bbd07..ada9f9022a 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front) class IcuStringIterator : public StringIterator { icu::BreakIterator *char_itr; ///< ICU iterator for characters. + icu::BreakIterator *word_itr; ///< ICU iterator for words. const char *string; ///< Iteration string in UTF-8. + SmallVector utf16_str; ///< UTF-16 copy of the string. + SmallVector utf16_to_utf8; ///< Mapping from UTF-16 code point position to index in the UTF-8 source string. + public: - IcuStringIterator() : char_itr(NULL) + IcuStringIterator() : char_itr(NULL), word_itr(NULL) { UErrorCode status = U_ZERO_ERROR; this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); + this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); + + *this->utf16_str.Append() = '\0'; + *this->utf16_to_utf8.Append() = 0; } virtual ~IcuStringIterator() { delete this->char_itr; + delete this->word_itr; } virtual void SetString(const char *s) { this->string = s; + /* Unfortunately current ICU versions only provide rudimentary support + * for word break iterators (especially for CJK languages) in combination + * with UTF-8 input. 
As a work around we have to convert the input to + * UTF-16 and create a mapping back to UTF-8 character indices. */ + this->utf16_str.Clear(); + this->utf16_to_utf8.Clear(); + + while (*s != '\0') { + size_t idx = s - this->string; + + WChar c = Utf8Consume(&s); + if (c < 0x10000) { + *this->utf16_str.Append() = (UChar)c; + } else { + /* Make a surrogate pair. */ + *this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10)); + *this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF)); + *this->utf16_to_utf8.Append() = idx; + } + *this->utf16_to_utf8.Append() = idx; + } + *this->utf16_str.Append() = '\0'; + *this->utf16_to_utf8.Append() = s - this->string; + UText text = UTEXT_INITIALIZER; UErrorCode status = U_ZERO_ERROR; - utext_openUTF8(&text, s, -1, &status); + utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status); this->char_itr->setText(&text, status); + this->word_itr->setText(&text, status); this->char_itr->first(); + this->word_itr->first(); } virtual size_t SetCurPosition(size_t pos) { + /* Convert incoming position to an UTF-16 string index. */ + uint utf16_pos = 0; + for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) { + if (this->utf16_to_utf8[i] == pos) { + utf16_pos = i; + break; + } + } + /* isBoundary has the documented side-effect of setting the current * position to the first valid boundary equal to or greater than * the passed value. */ - this->char_itr->isBoundary((int32_t)pos); - return this->char_itr->current(); + this->char_itr->isBoundary(utf16_pos); + return this->utf16_to_utf8[this->char_itr->current()]; } - virtual size_t Next() + virtual size_t Next(IterType what) { - int32_t pos = this->char_itr->next(); - return pos == icu::BreakIterator::DONE ? 
END : pos; + int32_t pos; + switch (what) { + case ITER_CHARACTER: + pos = this->char_itr->next(); + break; + + case ITER_WORD: + pos = this->word_itr->following(this->char_itr->current()); + /* The ICU word iterator considers both the start and the end of a word a valid + * break point, but we only want word starts. Move to the next location in + * case the new position points to whitespace. */ + while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next(); + + this->char_itr->isBoundary(pos); + break; + + default: + NOT_REACHED(); + } + + return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos]; } - virtual size_t Prev() + virtual size_t Prev(IterType what) { - int32_t pos = this->char_itr->previous(); - return pos == icu::BreakIterator::DONE ? END : pos; + int32_t pos; + switch (what) { + case ITER_CHARACTER: + pos = this->char_itr->previous(); + break; + + case ITER_WORD: + pos = this->word_itr->preceding(this->char_itr->current()); + /* The ICU word iterator considers both the start and the end of a word a valid + * break point, but we only want word starts. Move to the previous location in + * case the new position points to whitespace. */ + while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous(); + + this->char_itr->isBoundary(pos); + break; + + default: + NOT_REACHED(); + } + + return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos]; } }; @@ -742,26 +824,79 @@ public: return this->cur_pos = pos; } - virtual size_t Next() + virtual size_t Next(IterType what) { assert(this->string != NULL); /* Already at the end? 
*/ if (this->cur_pos >= this->len) return END; - WChar c; - this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos); - return this->cur_pos; + switch (what) { + case ITER_CHARACTER: { + WChar c; + this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos); + return this->cur_pos; + } + + case ITER_WORD: { + WChar c; + /* Consume current word. */ + size_t offs = Utf8Decode(&c, this->string + this->cur_pos); + while (this->cur_pos < this->len && !IsWhitespace(c)) { + this->cur_pos += offs; + offs = Utf8Decode(&c, this->string + this->cur_pos); + } + /* Consume whitespace to the next word. */ + while (this->cur_pos < this->len && IsWhitespace(c)) { + this->cur_pos += offs; + offs = Utf8Decode(&c, this->string + this->cur_pos); + } + + return this->cur_pos; + } + + default: + NOT_REACHED(); + } + + return END; } - virtual size_t Prev() + virtual size_t Prev(IterType what) { assert(this->string != NULL); /* Already at the beginning? */ if (this->cur_pos == 0) return END; - return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string; + switch (what) { + case ITER_CHARACTER: + return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string; + + case ITER_WORD: { + const char *s = this->string + this->cur_pos; + WChar c; + /* Consume preceding whitespace. */ + do { + s = Utf8PrevChar(s); + Utf8Decode(&c, s); + } while (s > this->string && IsWhitespace(c)); + /* Consume preceding word. */ + while (s > this->string && !IsWhitespace(c)) { + s = Utf8PrevChar(s); + Utf8Decode(&c, s); + } + /* Move caret back to the beginning of the word. */ + if (IsWhitespace(c)) Utf8Consume(&s); + + return this->cur_pos = s - this->string; + } + + default: + NOT_REACHED(); + } + + return END; } }; diff --git a/src/string_base.h b/src/string_base.h index 73439f6393..e1eaed3496 100644 --- a/src/string_base.h +++ b/src/string_base.h @@ -15,6 +15,12 @@ /** Class for iterating over different kind of parts of a string. 
*/ class StringIterator { public: + /** Type of the iterator. */ + enum IterType { + ITER_CHARACTER, ///< Iterate over characters (or more exactly grapheme clusters). + ITER_WORD, ///< Iterate over words. + }; + /** Sentinel to indicate end-of-iteration. */ static const size_t END = SIZE_MAX; @@ -45,13 +51,13 @@ public: * Advance the cursor by one iteration unit. * @return New cursor position (in bytes) or #END if the cursor is already at the end of the string. */ - virtual size_t Next() = 0; + virtual size_t Next(IterType what = ITER_CHARACTER) = 0; /** * Move the cursor back by one iteration unit. * @return New cursor position (in bytes) or #END if the cursor is already at the start of the string. */ - virtual size_t Prev() = 0; + virtual size_t Prev(IterType what = ITER_CHARACTER) = 0; protected: StringIterator() {} diff --git a/src/string_func.h b/src/string_func.h index b0a42b8085..d7056f1be1 100644 --- a/src/string_func.h +++ b/src/string_func.h @@ -90,7 +90,6 @@ static inline WChar Utf8Consume(const char **s) return c; } - /** * Return the length of a UTF-8 encoded character. * @param c Unicode character. @@ -156,6 +155,51 @@ static inline const char *Utf8PrevChar(const char *s) size_t Utf8StringLength(const char *s); +/** + * Is the given character a lead surrogate code point? + * @param c The character to test. + * @return True if the character is a lead surrogate code point. + */ +static inline bool Utf16IsLeadSurrogate(uint c) +{ + return c >= 0xD800 && c <= 0xDBFF; +} + +/** + * Is the given character a trail surrogate code point? + * @param c The character to test. + * @return True if the character is a trail surrogate code point. + */ +static inline bool Utf16IsTrailSurrogate(uint c) +{ + return c >= 0xDC00 && c <= 0xDFFF; +} + +/** + * Convert an UTF-16 surrogate pair to the corresponding Unicode character. + * @param lead Lead surrogate code point. + * @param trail Trail surrogate code point. + * @return Decoded Unicode character. 
+ */ +static inline WChar Utf16DecodeSurrogate(uint lead, uint trail) +{ + return 0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00)); +} + +/** + * Decode an UTF-16 character. + * @param c Pointer to one or two UTF-16 code points. + * @return Decoded Unicode character. + */ +static inline WChar Utf16DecodeChar(const uint16 *c) +{ + if (Utf16IsLeadSurrogate(c[0])) { + return Utf16DecodeSurrogate(c[0], c[1]); + } else { + return *c; + } +} + /** * Is the given character a text direction character. * @param c The character to test. diff --git a/src/textbuf.cpp b/src/textbuf.cpp index 6ea042244b..9a307058fb 100644 --- a/src/textbuf.cpp +++ b/src/textbuf.cpp @@ -219,70 +219,12 @@ bool Textbuf::InsertClipboard() return true; } -/** - * Checks if it is possible to move caret to the left - * @return true if the caret can be moved to the left, otherwise false. - */ -bool Textbuf::CanMoveCaretLeft() -{ - return this->caretpos != 0; -} - -/** - * Moves the caret to the left. - * @pre Ensure that Textbuf::CanMoveCaretLeft returns true - * @return The character under the caret. - */ -WChar Textbuf::MoveCaretLeft() -{ - assert(this->CanMoveCaretLeft()); - - size_t pos = this->char_iter->Prev(); - if (pos == StringIterator::END) pos = 0; - - this->caretpos = (uint16)pos; - this->UpdateCaretPosition(); - - WChar c; - Utf8Decode(&c, this->buf + this->caretpos); - - return c; -} - -/** - * Checks if it is possible to move caret to the right - * @return true if the caret can be moved to the right, otherwise false. - */ -bool Textbuf::CanMoveCaretRight() -{ - return this->caretpos < this->bytes - 1; -} - -/** - * Moves the caret to the right. - * @pre Ensure that Textbuf::CanMoveCaretRight returns true - * @return The character under the caret. 
- */ -WChar Textbuf::MoveCaretRight() -{ - assert(this->CanMoveCaretRight()); - - size_t pos = this->char_iter->Next(); - if (pos == StringIterator::END) pos = this->bytes - 1; - - this->caretpos = (uint16)pos; - this->UpdateCaretPosition(); - - WChar c; - Utf8Decode(&c, this->buf + this->caretpos); - return c; -} - /** Update the character iter after the text has changed. */ void Textbuf::UpdateStringIter() { this->char_iter->SetString(this->buf); - this->caretpos = (uint16)this->char_iter->SetCurPosition(this->caretpos); + size_t pos = this->char_iter->SetCurPosition(this->caretpos); + this->caretpos = pos == StringIterator::END ? 0 : (uint16)pos; } /** Update pixel width of the text. */ @@ -307,64 +249,38 @@ bool Textbuf::MovePos(uint16 keycode) { switch (keycode) { case WKC_LEFT: - if (this->CanMoveCaretLeft()) { - this->MoveCaretLeft(); - return true; - } - break; - case WKC_CTRL | WKC_LEFT: { - if (!this->CanMoveCaretLeft()) break; + if (this->caretpos == 0) break; - /* Unconditionally move one char to the left. */ - WChar c = this->MoveCaretLeft(); - /* Consume left whitespaces. */ - while (IsWhitespace(c)) { - if (!this->CanMoveCaretLeft()) return true; - c = this->MoveCaretLeft(); - } - /* Consume left word. */ - while (!IsWhitespace(c)) { - if (!this->CanMoveCaretLeft()) return true; - c = this->MoveCaretLeft(); - } - /* Place caret at the beginning of the left word. */ - this->MoveCaretRight(); + size_t pos = this->char_iter->Prev(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER); + if (pos == StringIterator::END) return true; + + this->caretpos = (uint16)pos; + this->UpdateCaretPosition(); return true; } case WKC_RIGHT: - if (this->CanMoveCaretRight()) { - this->MoveCaretRight(); - return true; - } - break; - case WKC_CTRL | WKC_RIGHT: { - if (!this->CanMoveCaretRight()) break; + if (this->caretpos >= this->bytes - 1) break; - /* Unconditionally move one char to the right. 
*/ - WChar c = this->MoveCaretRight(); - /* Continue to consume current word. */ - while (!IsWhitespace(c)) { - if (!this->CanMoveCaretRight()) return true; - c = this->MoveCaretRight(); - } - /* Consume right whitespaces. */ - while (IsWhitespace(c)) { - if (!this->CanMoveCaretRight()) return true; - c = this->MoveCaretRight(); - } + size_t pos = this->char_iter->Next(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER); + if (pos == StringIterator::END) return true; + + this->caretpos = (uint16)pos; + this->UpdateCaretPosition(); return true; } case WKC_HOME: this->caretpos = 0; + this->char_iter->SetCurPosition(this->caretpos); this->UpdateCaretPosition(); return true; case WKC_END: this->caretpos = this->bytes - 1; + this->char_iter->SetCurPosition(this->caretpos); this->UpdateCaretPosition(); return true; diff --git a/src/textbuf_type.h b/src/textbuf_type.h index 611d7e4436..f5100249cc 100644 --- a/src/textbuf_type.h +++ b/src/textbuf_type.h @@ -67,10 +67,6 @@ private: bool CanDelChar(bool backspace); WChar GetNextDelChar(bool backspace); void DelChar(bool backspace); - bool CanMoveCaretLeft(); - WChar MoveCaretLeft(); - bool CanMoveCaretRight(); - WChar MoveCaretRight(); void UpdateStringIter(); void UpdateWidth();