diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-10-26 13:57:00 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-11-02 11:31:01 +0000 |
commit | 1943b3c2a1dcee36c233724fc4ee7613d71b9cf6 (patch) | |
tree | 8c1b5f12357025c197da5427ae02cfdc2f3570d6 /chromium/third_party/cld_3/src | |
parent | 21ba0c5d4bf8fba15dddd97cd693bad2358b77fd (diff) | |
download | qtwebengine-chromium-1943b3c2a1dcee36c233724fc4ee7613d71b9cf6.tar.gz |
BASELINE: Update Chromium to 94.0.4606.111
Change-Id: I924781584def20fc800bedf6ff41fdb96c438193
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/cld_3/src')
-rw-r--r-- | chromium/third_party/cld_3/src/.github/workflows/main.yml | 139 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/CMakeLists.txt | 2 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/MANIFEST.in | 13 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/README.md | 120 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/gcld3/__init__.py | 1 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/gcld3/pybind_ext.cc | 43 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/requirements.txt | 3 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/setup.py | 120 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h | 4 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/src/sentence_features.cc | 8 | ||||
-rw-r--r-- | chromium/third_party/cld_3/src/src/sentence_features.h | 16 |
11 files changed, 458 insertions, 11 deletions
diff --git a/chromium/third_party/cld_3/src/.github/workflows/main.yml b/chromium/third_party/cld_3/src/.github/workflows/main.yml new file mode 100644 index 00000000000..74e463a2f0b --- /dev/null +++ b/chromium/third_party/cld_3/src/.github/workflows/main.yml @@ -0,0 +1,139 @@ +name: gcld3 + +on: [push, pull_request] + +jobs: + + test: + name: ${{ matrix.os }}-${{matrix.python-version}}-test + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + python-version: [3.6, 3.7, 3.8, pypy3] + + steps: + - uses: actions/checkout@v2 + + - name: Linux Dependencies + if: runner.os == 'Linux' + run: sudo apt-get install libprotobuf-dev protobuf-compiler python3-dev + + - name: MacOS Dependencies + if: runner.os == 'macOS' + run: brew install protobuf + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Build package + run: | + pip install setuptools + python setup.py install + + - name: Test with pytest + run: | + pip install pytest pytest-cov + pytest gcld3/tests/gcld3_test.py + + + sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + name: Install Python + with: + python-version: "3.8" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel + - name: Build sdist + run: python setup.py sdist + + - uses: actions/upload-artifact@v2 + with: + path: dist/*.tar.gz + + wheel: + name: ${{ matrix.os }}-wheel + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install cibuildwheel + 
run: | + python -m pip install cibuildwheel==1.5.5 auditwheel delocate + + - name: Build + env: + CIBW_BUILD: "cp36-* cp38-* pp36-*" + CIBW_SKIP: "*-win32 *-manylinux_i686 pp27-* cp27-* cp35-*" + CIBW_BEFORE_BUILD_LINUX: yum -y install protobuf-devel protobuf-compiler python3-devel + CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --lib-sdir . -w {dest_dir} {wheel}" + CIBW_BEFORE_BUILD_MACOS: brew install protobuf + CIBW_REPAIR_WHEEL_COMMAND_MACOS: "delocate-listdeps {wheel} && delocate-wheel -w {dest_dir} -v {wheel}" + run: | + python -m cibuildwheel --output-dir wheelhouse + + - uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl + + + pypi: + needs: [wheel, sdist] + runs-on: ubuntu-latest + # upload to PyPI on every tag starting with 'v' + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') + # alternatively, to publish when a GitHub Release is created, use the following rule: + # if: github.event_name == 'release' && github.event.action == 'published' + steps: + - uses: actions/download-artifact@v2 + with: + name: artifact + path: dist + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install twine + + - name: Upload to test pypi + env: + TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} + run: | + twine upload --repository-url https://test.pypi.org/legacy/ dist/* + + - name: Upload to pypi + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + twine upload dist/* diff --git a/chromium/third_party/cld_3/src/CMakeLists.txt b/chromium/third_party/cld_3/src/CMakeLists.txt index 732a8ae67b1..2fa3908799c 100644 --- a/chromium/third_party/cld_3/src/CMakeLists.txt +++ b/chromium/third_party/cld_3/src/CMakeLists.txt @@ -24,7 +24,7 @@ add_definitions(-fPIC) # Position Independant Code 
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) add_definitions(-std=c++11) # Needed for std::to_string(), ... -include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers +include_directories(${CMAKE_CURRENT_BINARY_DIR} ${Protobuf_INCLUDE_DIRS}) # needed to include generated pb headers add_library(${PROJECT_NAME} ${PROTO_SRCS} ${PROTO_HDRS} diff --git a/chromium/third_party/cld_3/src/MANIFEST.in b/chromium/third_party/cld_3/src/MANIFEST.in new file mode 100644 index 00000000000..9fb3e4859cd --- /dev/null +++ b/chromium/third_party/cld_3/src/MANIFEST.in @@ -0,0 +1,13 @@ +include LICENSE +include README.md +include requirements.txt +global-include *h +global-include *cc +global-include *proto +prune .github/ +prune .eggs/ +global-exclude *.pyc +global-exclude *.cache +global-exclude *.so +exclude src/cld_3/protos/*h +exclude src/cld_3/protos/*cc diff --git a/chromium/third_party/cld_3/src/README.md b/chromium/third_party/cld_3/src/README.md index 86008330c43..462dd8e60e3 100644 --- a/chromium/third_party/cld_3/src/README.md +++ b/chromium/third_party/cld_3/src/README.md @@ -1,8 +1,9 @@ # Compact Language Detector v3 (CLD3) * [Model](#model) +* [Supported Languages](#supported-languages) * [Installation](#installation) -* [Contact](#contact) +* [Bugs and Feature Requests](#bugs-and-feature-requests) * [Credits](#credits) ### Model @@ -26,6 +27,123 @@ To get a language prediction for the input text, we simply perform a forward ![Figure](model.png "CLD3") +### Supported Languages + +The model outputs BCP-47-style language codes, shown in the table below. For +some languages, output is differentiated by script. Language and script names +from +[Unicode CLDR](https://github.com/unicode-cldr/cldr-localenames-modern/blob/master/main/en). 
+ +Output Code | Language Name | Script Name +----------- | --------------- | ------------------------------------------ +af | Afrikaans | Latin +am | Amharic | Ethiopic +ar | Arabic | Arabic +bg | Bulgarian | Cyrillic +bg-Latn | Bulgarian | Latin +bn | Bangla | Bangla +bs | Bosnian | Latin +ca | Catalan | Latin +ceb | Cebuano | Latin +co | Corsican | Latin +cs | Czech | Latin +cy | Welsh | Latin +da | Danish | Latin +de | German | Latin +el | Greek | Greek +el-Latn | Greek | Latin +en | English | Latin +eo | Esperanto | Latin +es | Spanish | Latin +et | Estonian | Latin +eu | Basque | Latin +fa | Persian | Arabic +fi | Finnish | Latin +fil | Filipino | Latin +fr | French | Latin +fy | Western Frisian | Latin +ga | Irish | Latin +gd | Scottish Gaelic | Latin +gl | Galician | Latin +gu | Gujarati | Gujarati +ha | Hausa | Latin +haw | Hawaiian | Latin +hi | Hindi | Devanagari +hi-Latn | Hindi | Latin +hmn | Hmong | Latin +hr | Croatian | Latin +ht | Haitian Creole | Latin +hu | Hungarian | Latin +hy | Armenian | Armenian +id | Indonesian | Latin +ig | Igbo | Latin +is | Icelandic | Latin +it | Italian | Latin +iw | Hebrew | Hebrew +ja | Japanese | Japanese +ja-Latn | Japanese | Latin +jv | Javanese | Latin +ka | Georgian | Georgian +kk | Kazakh | Cyrillic +km | Khmer | Khmer +kn | Kannada | Kannada +ko | Korean | Korean +ku | Kurdish | Latin +ky | Kyrgyz | Cyrillic +la | Latin | Latin +lb | Luxembourgish | Latin +lo | Lao | Lao +lt | Lithuanian | Latin +lv | Latvian | Latin +mg | Malagasy | Latin +mi | Maori | Latin +mk | Macedonian | Cyrillic +ml | Malayalam | Malayalam +mn | Mongolian | Cyrillic +mr | Marathi | Devanagari +ms | Malay | Latin +mt | Maltese | Latin +my | Burmese | Myanmar +ne | Nepali | Devanagari +nl | Dutch | Latin +no | Norwegian | Latin +ny | Nyanja | Latin +pa | Punjabi | Gurmukhi +pl | Polish | Latin +ps | Pashto | Arabic +pt | Portuguese | Latin +ro | Romanian | Latin +ru | Russian | Cyrillic +ru-Latn | Russian | Latin +sd | Sindhi | Arabic 
+si | Sinhala | Sinhala +sk | Slovak | Latin +sl | Slovenian | Latin +sm | Samoan | Latin +sn | Shona | Latin +so | Somali | Latin +sq | Albanian | Latin +sr | Serbian | Cyrillic +st | Southern Sotho | Latin +su | Sundanese | Latin +sv | Swedish | Latin +sw | Swahili | Latin +ta | Tamil | Tamil +te | Telugu | Telugu +tg | Tajik | Cyrillic +th | Thai | Thai +tr | Turkish | Latin +uk | Ukrainian | Cyrillic +ur | Urdu | Arabic +uz | Uzbek | Latin +vi | Vietnamese | Latin +xh | Xhosa | Latin +yi | Yiddish | Hebrew +yo | Yoruba | Latin +zh | Chinese | Han (including Simplified and Traditional) +zh-Latn | Chinese | Latin +zu | Zulu | Latin + ### Installation CLD3 is designed to run in the Chrome browser, so it relies on code in [Chromium](http://www.chromium.org/). diff --git a/chromium/third_party/cld_3/src/gcld3/__init__.py b/chromium/third_party/cld_3/src/gcld3/__init__.py new file mode 100644 index 00000000000..b7d4dcc3f75 --- /dev/null +++ b/chromium/third_party/cld_3/src/gcld3/__init__.py @@ -0,0 +1 @@ +from .pybind_ext import * diff --git a/chromium/third_party/cld_3/src/gcld3/pybind_ext.cc b/chromium/third_party/cld_3/src/gcld3/pybind_ext.cc new file mode 100644 index 00000000000..024d6d56d89 --- /dev/null +++ b/chromium/third_party/cld_3/src/gcld3/pybind_ext.cc @@ -0,0 +1,43 @@ +#include <pybind11/pybind11.h> +#include <pybind11/pytypes.h> +#include <pybind11/stl.h> + +#include "../src/nnet_language_identifier.h" + +namespace pybind11 { + +using chrome_lang_id::NNetLanguageIdentifier; + +// This is conventional. 
+namespace py = pybind11; + +PYBIND11_MODULE(pybind_ext, py_module) { + py::class_<NNetLanguageIdentifier>(py_module, "NNetLanguageIdentifier") + .def(py::init<const int, const int>(), py::arg("min_num_bytes"), + py::arg("max_num_bytes")) + .def("FindLanguage", &NNetLanguageIdentifier::FindLanguage, + py::arg("text")) + .def("FindTopNMostFreqLangs", + &NNetLanguageIdentifier::FindTopNMostFreqLangs, py::arg("text"), + py::arg("num_langs")) + .def_readonly_static("kUnknown", &NNetLanguageIdentifier::kUnknown) + .def_readonly_static("kMinNumBytesToConsider", + &NNetLanguageIdentifier::kMinNumBytesToConsider) + .def_readonly_static("kMaxNumBytesToConsider", + &NNetLanguageIdentifier::kMaxNumBytesToConsider) + .def_readonly_static("kMaxNumInputBytesToConsider", + &NNetLanguageIdentifier::kMaxNumInputBytesToConsider) + .def_readonly_static("kReliabilityThreshold", + &NNetLanguageIdentifier::kReliabilityThreshold) + .def_readonly_static("kReliabilityHrBsThreshold", + &NNetLanguageIdentifier::kReliabilityHrBsThreshold); + + py::class_<NNetLanguageIdentifier::Result>(py_module, "Result") + .def_readwrite("language", &NNetLanguageIdentifier::Result::language) + .def_readwrite("probability", + &NNetLanguageIdentifier::Result::probability) + .def_readwrite("is_reliable", + &NNetLanguageIdentifier::Result::is_reliable) + .def_readwrite("proportion", &NNetLanguageIdentifier::Result::proportion); +} +} // namespace pybind11 diff --git a/chromium/third_party/cld_3/src/requirements.txt b/chromium/third_party/cld_3/src/requirements.txt new file mode 100644 index 00000000000..8d0ba11fa12 --- /dev/null +++ b/chromium/third_party/cld_3/src/requirements.txt @@ -0,0 +1,3 @@ +protobuf >=3.0.0 +pybind11 >=2.5.0 +wheel >= 0.34.2 diff --git a/chromium/third_party/cld_3/src/setup.py b/chromium/third_party/cld_3/src/setup.py new file mode 100644 index 00000000000..385189fc99e --- /dev/null +++ b/chromium/third_party/cld_3/src/setup.py @@ -0,0 +1,120 @@ +"""Setup utility for gcld3.""" + +import 
os +import platform +import shutil +import subprocess +import setuptools +from setuptools.command import build_ext + +__version__ = '3.0.13' +_NAME = 'gcld3' + +REQUIREMENTS = ['pybind11 >= 2.5.0', 'wheel >= 0.34.2'] + +PROTO_FILES = [ + 'src/feature_extractor.proto', + 'src/sentence.proto', + 'src/task_spec.proto', +] + +SRCS = [ + 'src/base.cc', + 'src/embedding_feature_extractor.cc', + 'src/embedding_network.cc', + 'src/feature_extractor.cc', + 'src/feature_types.cc', + 'src/fml_parser.cc', + 'src/lang_id_nn_params.cc', + 'src/language_identifier_features.cc', + 'src/language_identifier_main.cc', + 'src/nnet_language_identifier.cc', + 'src/registry.cc', + 'src/relevant_script_feature.cc', + 'src/sentence_features.cc', + 'src/task_context.cc', + 'src/task_context_params.cc', + 'src/unicodetext.cc', + 'src/utils.cc', + 'src/workspace.cc', + 'src/script_span/fixunicodevalue.cc', + 'src/script_span/generated_entities.cc', + 'src/script_span/generated_ulscript.cc', + 'src/script_span/getonescriptspan.cc', + 'src/script_span/offsetmap.cc', + 'src/script_span/text_processing.cc', + 'src/script_span/utf8statetable.cc', + # These CC files have to be generated by the proto buffer compiler 'protoc' + 'src/cld_3/protos/feature_extractor.pb.cc', + 'src/cld_3/protos/sentence.pb.cc', + 'src/cld_3/protos/task_spec.pb.cc', + # pybind11 bindings + 'gcld3/pybind_ext.cc', +] + + +class CompileProtos(build_ext.build_ext): + """Compile protocol buffers via `protoc` compiler.""" + + def run(self): + if shutil.which('protoc') is None: + raise RuntimeError('Please install the proto buffer compiler.') + + # The C++ code expect the protos to be compiled under the following + # directory, therefore, create it if necessary. 
+ compiled_protos_dir = 'src/cld_3/protos/' + os.makedirs(compiled_protos_dir, exist_ok=True) + command = ['protoc', f'--cpp_out={compiled_protos_dir}', '--proto_path=src'] + command.extend(PROTO_FILES) + subprocess.run(command, check=True, cwd='./') + build_ext.build_ext.run(self) + + +class PyBindIncludes(object): + """Returns the include paths for pybind11 when needed. + + To delay the invocation of "pybind11.get_include()" until it is available + in the environment. This lazy evaluation allows us to install it first, then + import it later to determine the correct include paths. + """ + + def __str__(self): + import pybind11 # pylint: disable=g-import-not-at-top + return pybind11.get_include() + + +MACOS = platform.system() == 'Darwin' +ext_modules = [ + setuptools.Extension( + 'gcld3.pybind_ext', + sorted(SRCS), + include_dirs=[ + PyBindIncludes(), + ], + libraries=['protobuf'], + extra_compile_args=['-std=c++11', '-stdlib=libc++'] if MACOS else [], + extra_link_args=['-stdlib=libc++'] if MACOS else [], + language='c++'), +] + +DESCRIPTION = """CLD3 is a neural network model for language identification. +This package contains the inference code and a trained model. See +https://github.com/google/cld3 for more details. 
+""" + +setuptools.setup( + author='Rami Al-Rfou', + author_email='rmyeid@google.com', + cmdclass={ + 'build_ext': CompileProtos, + }, + ext_modules=ext_modules, + packages=setuptools.find_packages(), + description='CLD3 is a neural network model for language identification.', + long_description=DESCRIPTION, + name=_NAME, + setup_requires=REQUIREMENTS, + url='https://github.com/google/cld3', + version=__version__, + zip_safe=False, +) diff --git a/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h b/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h index 33a71302bbc..004f903ea87 100644 --- a/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h +++ b/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h @@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room static const int kWithinScriptTail = 32; // Stop at word space in last // N bytes of script buffer -typedef struct { +struct LangSpan { char* text = nullptr; // Pointer to the span, somewhere int text_bytes = 0; // Number of bytes of text in the span int offset = 0; // Offset of start of span in original input buffer ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span bool truncated = false; // true if buffer filled up before a // different script or EOF was found -} LangSpan; +}; static inline bool IsContinuationByte(char c) { return static_cast<signed char>(c) < -64; diff --git a/chromium/third_party/cld_3/src/src/sentence_features.cc b/chromium/third_party/cld_3/src/src/sentence_features.cc index cae6e368c87..70d64f40ccf 100644 --- a/chromium/third_party/cld_3/src/src/sentence_features.cc +++ b/chromium/third_party/cld_3/src/src/sentence_features.cc @@ -19,11 +19,11 @@ limitations under the License. namespace chrome_lang_id { -// Declare registry for the whole Sentence feature functions. NOTE: this is not +// Define registry for the whole Sentence feature functions. 
NOTE: this is not // yet set to anything meaningful. It will be set so in NNetLanguageIdentifier // constructor, *before* we use any feature. template <> -WholeSentenceFeature::Registry - *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr; +WholeSentenceFeature::Registry* + RegisterableClass<WholeSentenceFeature>::registry_ = nullptr; -} // namespace chrome_lang_id +} // namespace chrome_lang_id
\ No newline at end of file diff --git a/chromium/third_party/cld_3/src/src/sentence_features.h b/chromium/third_party/cld_3/src/src/sentence_features.h index 165bfd5f1a3..cc0be883305 100644 --- a/chromium/third_party/cld_3/src/src/sentence_features.h +++ b/chromium/third_party/cld_3/src/src/sentence_features.h @@ -26,9 +26,19 @@ limitations under the License. namespace chrome_lang_id { // Feature function that extracts features for the full Sentence. -typedef FeatureFunction<Sentence> WholeSentenceFeature; - -typedef FeatureExtractor<Sentence> WholeSentenceExtractor; +using WholeSentenceFeature = FeatureFunction<Sentence>; + +using WholeSentenceExtractor = FeatureExtractor<Sentence>; + +// Declare registry for the whole Sentence feature functions. This is required +// for clang's -Wundefined-var-template. However, MSVC has a bug which treats +// this declaration as a definition, leading to multiple definition errors, so +// omit this on MSVC. +#if !defined(COMPILER_MSVC) +template <> +WholeSentenceFeature::Registry + *RegisterableClass<WholeSentenceFeature>::registry_; +#endif } // namespace chrome_lang_id |