diff options
Diffstat (limited to 'chromium/third_party/WebKit/Source/core/platform/text/TextEncodingDetectorICU.cpp')
-rw-r--r-- | chromium/third_party/WebKit/Source/core/platform/text/TextEncodingDetectorICU.cpp | 117 |
1 files changed, 0 insertions, 117 deletions
diff --git a/chromium/third_party/WebKit/Source/core/platform/text/TextEncodingDetectorICU.cpp b/chromium/third_party/WebKit/Source/core/platform/text/TextEncodingDetectorICU.cpp deleted file mode 100644 index d0409d642e3..00000000000 --- a/chromium/third_party/WebKit/Source/core/platform/text/TextEncodingDetectorICU.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (C) 2008, 2009 Google Inc. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Google Inc. nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "core/platform/text/TextEncodingDetector.h" - -#include "wtf/text/TextEncoding.h" -#include <unicode/ucnv.h> -#include <unicode/ucsdet.h> - -namespace WebCore { - -bool detectTextEncoding(const char* data, size_t len, - const char* hintEncodingName, - WTF::TextEncoding* detectedEncoding) -{ - *detectedEncoding = WTF::TextEncoding(); - int matchesCount = 0; - UErrorCode status = U_ZERO_ERROR; - UCharsetDetector* detector = ucsdet_open(&status); - if (U_FAILURE(status)) - return false; - ucsdet_enableInputFilter(detector, true); - ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); - if (U_FAILURE(status)) - return false; - - // FIXME: A few things we can do other than improving - // the ICU detector itself. - // 1. Use ucsdet_detectAll and pick the most likely one given - // "the context" (parent-encoding, referrer encoding, etc). - // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. - // Chinese, Japanese, Russian, Korean and Hebrew) by picking the - // encoding with a highest confidence among the detector-specific - // limited set of candidate encodings. - // Below is a partial implementation of the first part of what's outlined - // above. - const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); - if (U_FAILURE(status)) { - ucsdet_close(detector); - return false; - } - - const char* encoding = 0; - if (hintEncodingName) { - WTF::TextEncoding hintEncoding(hintEncodingName); - // 10 is the minimum confidence value consistent with the codepoint - // allocation in a given encoding. The size of a chunk passed to - // us varies even for the same html file (apparently depending on - // the network load). When we're given a rather short chunk, we - // don't have a sufficiently reliable signal other than the fact that - // the chunk is consistent with a set of encodings. So, instead of - // setting an arbitrary threshold, we have to scan all the encodings - // consistent with the data. - const int32_t kThresold = 10; - for (int i = 0; i < matchesCount; ++i) { - int32_t confidence = ucsdet_getConfidence(matches[i], &status); - if (U_FAILURE(status)) { - status = U_ZERO_ERROR; - continue; - } - if (confidence < kThresold) - break; - const char* matchEncoding = ucsdet_getName(matches[i], &status); - if (U_FAILURE(status)) { - status = U_ZERO_ERROR; - continue; - } - if (WTF::TextEncoding(matchEncoding) == hintEncoding) { - encoding = hintEncodingName; - break; - } - } - } - // If no match is found so far, just pick the top match. - // This can happen, say, when a parent frame in EUC-JP refers to - // a child frame in Shift_JIS and both frames do NOT specify the encoding - // making us resort to auto-detection (when it IS turned on). - if (!encoding && matchesCount > 0) - encoding = ucsdet_getName(matches[0], &status); - if (U_SUCCESS(status)) { - *detectedEncoding = WTF::TextEncoding(encoding); - ucsdet_close(detector); - return true; - } - ucsdet_close(detector); - return false; -} - -} |