#include #include #include #include #include #include namespace mbgl { class BiDiImpl { public: BiDiImpl() : bidiText(ubidi_open()), bidiLine(ubidi_open()) { } ~BiDiImpl() { ubidi_close(bidiText); ubidi_close(bidiLine); } UBiDi* bidiText = nullptr; UBiDi* bidiLine = nullptr; }; BiDi::BiDi() : impl(std::make_unique()) {} BiDi::~BiDi() = default; // Takes UTF16 input in logical order and applies Arabic shaping to the input while maintaining // logical order. Output won't be intelligible until the bidirectional algorithm is applied std::u16string applyArabicShaping(const std::u16string& input) { UErrorCode errorCode = U_ZERO_ERROR; const int32_t outputLength = u_shapeArabic(mbgl::utf16char_cast(input.c_str()), static_cast(input.size()), nullptr, 0, (U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK) | (U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK), &errorCode); // Pre-flighting will always set U_BUFFER_OVERFLOW_ERROR errorCode = U_ZERO_ERROR; std::u16string outputText(outputLength, 0); u_shapeArabic(mbgl::utf16char_cast(input.c_str()), static_cast(input.size()), mbgl::utf16char_cast(&outputText[0]), outputLength, (U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK) | (U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK), &errorCode); // If the algorithm fails for any reason, fall back to non-transformed text if (U_FAILURE(errorCode)) return input; return outputText; } void BiDi::mergeParagraphLineBreaks(std::set& lineBreakPoints) { int32_t paragraphCount = ubidi_countParagraphs(impl->bidiText); for (int32_t i = 0; i < paragraphCount; i++) { UErrorCode errorCode = U_ZERO_ERROR; int32_t paragraphEndIndex; ubidi_getParagraphByIndex(impl->bidiText, i, nullptr, ¶graphEndIndex, nullptr, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("ProcessedBiDiText::mergeParagraphLineBreaks: ") + u_errorName(errorCode)); } lineBreakPoints.insert(static_cast(paragraphEndIndex)); } } std::vector BiDi::applyLineBreaking(std::set lineBreakPoints) { // BiDi::getLine will error if called across a paragraph boundary, so we need to ensure that all // paragraph boundaries are included in the set of line break points. The calling code might not // include the line break because it didn't need to wrap at that point, or because the text was // separated with a more exotic code point such as (U+001C) mergeParagraphLineBreaks(lineBreakPoints); std::vector transformedLines; transformedLines.reserve(lineBreakPoints.size()); std::size_t start = 0; for (std::size_t lineBreakPoint : lineBreakPoints) { transformedLines.push_back(getLine(start, lineBreakPoint)); start = lineBreakPoint; } return transformedLines; } std::vector BiDi::processText(const std::u16string& input, std::set lineBreakPoints) { UErrorCode errorCode = U_ZERO_ERROR; ubidi_setPara(impl->bidiText, mbgl::utf16char_cast(input.c_str()), static_cast(input.size()), UBIDI_DEFAULT_LTR, nullptr, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::processText: ") + u_errorName(errorCode)); } return applyLineBreaking(std::move(lineBreakPoints)); } std::vector BiDi::processStyledText(const StyledText& input, std::set lineBreakPoints) { std::vector lines; const auto& inputText = input.first; const auto& styleIndices = input.second; UErrorCode errorCode = U_ZERO_ERROR; ubidi_setPara(impl->bidiText, mbgl::utf16char_cast(inputText.c_str()), static_cast(inputText.size()), UBIDI_DEFAULT_LTR, nullptr, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::processStyledText: ") + u_errorName(errorCode)); } mergeParagraphLineBreaks(lineBreakPoints); std::size_t lineStartIndex = 0; for (std::size_t lineBreakPoint : lineBreakPoints) { StyledText line; line.second.reserve(lineBreakPoint - lineStartIndex); errorCode = U_ZERO_ERROR; ubidi_setLine(impl->bidiText, static_cast(lineStartIndex), static_cast(lineBreakPoint), impl->bidiLine, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::processStyledText (setLine): ") + u_errorName(errorCode)); } errorCode = U_ZERO_ERROR; uint32_t runCount = ubidi_countRuns(impl->bidiLine, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::processStyledText (countRuns): ") + u_errorName(errorCode)); } for (uint32_t runIndex = 0; runIndex < runCount; runIndex++) { int32_t runLogicalStart; int32_t runLength; UBiDiDirection direction = ubidi_getVisualRun(impl->bidiLine, runIndex, &runLogicalStart, &runLength); const bool isReversed = direction == UBIDI_RTL; std::size_t logicalStart = lineStartIndex + runLogicalStart; std::size_t logicalEnd = logicalStart + runLength; if (isReversed) { // Within this reversed section, iterate logically backwards // Each time we see a change in style, render a reversed chunk // of everything since the last change std::size_t styleRunStart = logicalEnd; uint8_t currentStyleIndex = styleIndices.at(styleRunStart - 1); for (std::size_t i = logicalEnd - 1; i >= logicalStart; i--) { if (currentStyleIndex != styleIndices.at(i) || i == logicalStart) { std::size_t styleRunEnd = i == logicalStart ? i : i + 1; std::u16string reversed = writeReverse(inputText, styleRunEnd, styleRunStart); line.first += reversed; for (std::size_t j = 0; j < reversed.size(); j++) { line.second.push_back(currentStyleIndex); } currentStyleIndex = styleIndices.at(i); styleRunStart = styleRunEnd; } if (i == 0) { break; } } } else { line.first += input.first.substr(logicalStart, runLength); line.second.insert(line.second.end(), styleIndices.begin() + logicalStart, styleIndices.begin() + logicalStart + runLength); } } lines.push_back(line); lineStartIndex = lineBreakPoint; } return lines; } std::u16string BiDi::writeReverse(const std::u16string& input, std::size_t logicalStart, std::size_t logicalEnd) { UErrorCode errorCode = U_ZERO_ERROR; auto logicalLength = static_cast(logicalEnd - logicalStart); std::u16string outputText(logicalLength + 1, 0); // UBIDI_DO_MIRRORING: Apply unicode mirroring of characters like parentheses // UBIDI_REMOVE_BIDI_CONTROLS: Now that all the lines are set, remove control characters so that // they don't show up on screen (some fonts have glyphs representing them) int32_t outputLength = ubidi_writeReverse(mbgl::utf16char_cast(&input[logicalStart]), logicalLength, mbgl::utf16char_cast(&outputText[0]), logicalLength + 1, // Extra room for null terminator, although we don't really need to have ICU write it for us UBIDI_DO_MIRRORING | UBIDI_REMOVE_BIDI_CONTROLS, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::writeReverse: ") + u_errorName(errorCode)); } outputText.resize(outputLength); // REMOVE_BIDI_CONTROLS may have shrunk the string return outputText; } std::u16string BiDi::getLine(std::size_t start, std::size_t end) { UErrorCode errorCode = U_ZERO_ERROR; ubidi_setLine(impl->bidiText, static_cast(start), static_cast(end), impl->bidiLine, &errorCode); if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::getLine (setLine): ") + u_errorName(errorCode)); } // Because we set UBIDI_REMOVE_BIDI_CONTROLS, the output may be smaller than what we reserve // Setting UBIDI_INSERT_LRM_FOR_NUMERIC would require // ubidi_getLength(pBiDi)+2*ubidi_countRuns(pBiDi) const int32_t outputLength = ubidi_getProcessedLength(impl->bidiLine); std::u16string outputText(outputLength, 0); // UBIDI_DO_MIRRORING: Apply unicode mirroring of characters like parentheses // UBIDI_REMOVE_BIDI_CONTROLS: Now that all the lines are set, remove control characters so that // they don't show up on screen (some fonts have glyphs representing them) int32_t finalLength = ubidi_writeReordered(impl->bidiLine, mbgl::utf16char_cast(&outputText[0]), outputLength, UBIDI_DO_MIRRORING | UBIDI_REMOVE_BIDI_CONTROLS, &errorCode); outputText.resize(finalLength); // REMOVE_BIDI_CONTROLS may have shrunk the string if (U_FAILURE(errorCode)) { throw std::runtime_error(std::string("BiDi::getLine (writeReordered): ") + u_errorName(errorCode)); } return outputText; } } // end namespace mbgl