summaryrefslogtreecommitdiff
path: root/chromium/third_party/cld_3/src
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2021-10-26 13:57:00 +0200
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2021-11-02 11:31:01 +0000
commit1943b3c2a1dcee36c233724fc4ee7613d71b9cf6 (patch)
tree8c1b5f12357025c197da5427ae02cfdc2f3570d6 /chromium/third_party/cld_3/src
parent21ba0c5d4bf8fba15dddd97cd693bad2358b77fd (diff)
downloadqtwebengine-chromium-1943b3c2a1dcee36c233724fc4ee7613d71b9cf6.tar.gz
BASELINE: Update Chromium to 94.0.4606.111
Change-Id: I924781584def20fc800bedf6ff41fdb96c438193 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/cld_3/src')
-rw-r--r--chromium/third_party/cld_3/src/.github/workflows/main.yml139
-rw-r--r--chromium/third_party/cld_3/src/CMakeLists.txt2
-rw-r--r--chromium/third_party/cld_3/src/MANIFEST.in13
-rw-r--r--chromium/third_party/cld_3/src/README.md120
-rw-r--r--chromium/third_party/cld_3/src/gcld3/__init__.py1
-rw-r--r--chromium/third_party/cld_3/src/gcld3/pybind_ext.cc43
-rw-r--r--chromium/third_party/cld_3/src/requirements.txt3
-rw-r--r--chromium/third_party/cld_3/src/setup.py120
-rw-r--r--chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h4
-rw-r--r--chromium/third_party/cld_3/src/src/sentence_features.cc8
-rw-r--r--chromium/third_party/cld_3/src/src/sentence_features.h16
11 files changed, 458 insertions, 11 deletions
diff --git a/chromium/third_party/cld_3/src/.github/workflows/main.yml b/chromium/third_party/cld_3/src/.github/workflows/main.yml
new file mode 100644
index 00000000000..74e463a2f0b
--- /dev/null
+++ b/chromium/third_party/cld_3/src/.github/workflows/main.yml
@@ -0,0 +1,139 @@
+name: gcld3
+
+on: [push, pull_request]
+
+jobs:
+
+  # Build the extension from source and run the Python tests on every
+  # OS / interpreter combination in the matrix.
+  test:
+    name: ${{ matrix.os }}-${{matrix.python-version}}-test
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        # Quoted so YAML keeps the versions as strings; an unquoted 3.10
+        # would be read as the float 3.1.
+        python-version: ["3.6", "3.7", "3.8", "pypy3"]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Linux Dependencies
+        if: runner.os == 'Linux'
+        # Refresh the package index first and pass -y so the install neither
+        # hits stale package URLs nor waits on a confirmation prompt.
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libprotobuf-dev protobuf-compiler python3-dev
+
+      - name: MacOS Dependencies
+        if: runner.os == 'macOS'
+        run: brew install protobuf
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+      - name: Build package
+        run: |
+          pip install setuptools
+          python setup.py install
+
+      - name: Test with pytest
+        run: |
+          pip install pytest pytest-cov
+          pytest gcld3/tests/gcld3_test.py
+
+
+  # Produce the source tarball (sdist) and store it as a workflow artifact.
+  sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Install Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.8'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel
+
+      - name: Build sdist
+        run: python setup.py sdist
+
+      # No artifact name is set here; the `pypi` job downloads an artifact
+      # called `artifact`.
+      - uses: actions/upload-artifact@v2
+        with:
+          path: dist/*.tar.gz
+
+  # Build binary wheels with cibuildwheel on each OS in the matrix and store
+  # them as a workflow artifact.
+  wheel:
+    name: ${{ matrix.os }}-wheel
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          # Quoted so the version stays a string rather than a YAML float,
+          # matching the `sdist` and `pypi` jobs.
+          python-version: "3.8"
+
+      - name: Install cibuildwheel
+        run: |
+          python -m pip install cibuildwheel==1.5.5 auditwheel delocate
+
+      - name: Build
+        env:
+          CIBW_BUILD: "cp36-* cp38-* pp36-*"
+          CIBW_SKIP: "*-win32 *-manylinux_i686 pp27-* cp27-* cp35-*"
+          # Per-platform commands that install protobuf before building and
+          # repair the resulting wheel afterwards.
+          CIBW_BEFORE_BUILD_LINUX: yum -y install protobuf-devel protobuf-compiler python3-devel
+          CIBW_REPAIR_WHEEL_COMMAND_LINUX: "auditwheel repair --lib-sdir . -w {dest_dir} {wheel}"
+          CIBW_BEFORE_BUILD_MACOS: brew install protobuf
+          CIBW_REPAIR_WHEEL_COMMAND_MACOS: "delocate-listdeps {wheel} && delocate-wheel -w {dest_dir} -v {wheel}"
+        run: |
+          python -m cibuildwheel --output-dir wheelhouse
+
+      - uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl
+
+
+  # Publish the artifacts built by `wheel` and `sdist`: first to Test PyPI,
+  # then to PyPI.
+  pypi:
+    needs:
+      - wheel
+      - sdist
+    runs-on: ubuntu-latest
+    # upload to PyPI on every tag starting with 'v'
+    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
+    # alternatively, to publish when a GitHub Release is created, use the following rule:
+    # if: github.event_name == 'release' && github.event.action == 'published'
+    steps:
+      # Fetch the uploaded distributions into ./dist for twine.
+      - uses: actions/download-artifact@v2
+        with:
+          name: artifact
+          path: dist
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.8"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install twine
+
+      # Credentials are read from repository secrets via twine's
+      # TWINE_USERNAME / TWINE_PASSWORD environment variables.
+      - name: Upload to test pypi
+        env:
+          TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
+        run: twine upload --repository-url https://test.pypi.org/legacy/ dist/*
+
+      - name: Upload to pypi
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: twine upload dist/*
diff --git a/chromium/third_party/cld_3/src/CMakeLists.txt b/chromium/third_party/cld_3/src/CMakeLists.txt
index 732a8ae67b1..2fa3908799c 100644
--- a/chromium/third_party/cld_3/src/CMakeLists.txt
+++ b/chromium/third_party/cld_3/src/CMakeLists.txt
@@ -24,7 +24,7 @@ add_definitions(-fPIC) # Position Independent Code
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
add_definitions(-std=c++11) # Needed for std::to_string(), ...
-include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers
+include_directories(${CMAKE_CURRENT_BINARY_DIR} ${Protobuf_INCLUDE_DIRS}) # needed to include generated pb headers
add_library(${PROJECT_NAME}
${PROTO_SRCS} ${PROTO_HDRS}
diff --git a/chromium/third_party/cld_3/src/MANIFEST.in b/chromium/third_party/cld_3/src/MANIFEST.in
new file mode 100644
index 00000000000..9fb3e4859cd
--- /dev/null
+++ b/chromium/third_party/cld_3/src/MANIFEST.in
@@ -0,0 +1,13 @@
+# Files shipped in the source distribution (sdist).
+include LICENSE
+include README.md
+include requirements.txt
+# Match on the real file extension: a bare `*h` would also pick up any file
+# whose name merely ends in "h" (for example `*.sh`).
+global-include *.h
+global-include *.cc
+global-include *.proto
+prune .github/
+prune .eggs/
+global-exclude *.pyc
+global-exclude *.cache
+global-exclude *.so
+# Generated protobuf sources are left out; setup.py recreates them with
+# `protoc` at build time.
+exclude src/cld_3/protos/*.h
+exclude src/cld_3/protos/*.cc
diff --git a/chromium/third_party/cld_3/src/README.md b/chromium/third_party/cld_3/src/README.md
index 86008330c43..462dd8e60e3 100644
--- a/chromium/third_party/cld_3/src/README.md
+++ b/chromium/third_party/cld_3/src/README.md
@@ -1,8 +1,9 @@
# Compact Language Detector v3 (CLD3)
* [Model](#model)
+* [Supported Languages](#supported-languages)
* [Installation](#installation)
-* [Contact](#contact)
+* [Bugs and Feature Requests](#bugs-and-feature-requests)
* [Credits](#credits)
### Model
@@ -26,6 +27,123 @@ To get a language prediction for the input text, we simply perform a forward
![Figure](model.png "CLD3")
+### Supported Languages
+
+The model outputs BCP-47-style language codes, shown in the table below. For
+some languages, output is differentiated by script. Language and script names
+are taken from
+[Unicode CLDR](https://github.com/unicode-cldr/cldr-localenames-modern/blob/master/main/en).
+
+Output Code | Language Name | Script Name
+----------- | --------------- | ------------------------------------------
+af | Afrikaans | Latin
+am | Amharic | Ethiopic
+ar | Arabic | Arabic
+bg | Bulgarian | Cyrillic
+bg-Latn | Bulgarian | Latin
+bn | Bangla | Bangla
+bs | Bosnian | Latin
+ca | Catalan | Latin
+ceb | Cebuano | Latin
+co | Corsican | Latin
+cs | Czech | Latin
+cy | Welsh | Latin
+da | Danish | Latin
+de | German | Latin
+el | Greek | Greek
+el-Latn | Greek | Latin
+en | English | Latin
+eo | Esperanto | Latin
+es | Spanish | Latin
+et | Estonian | Latin
+eu | Basque | Latin
+fa | Persian | Arabic
+fi | Finnish | Latin
+fil | Filipino | Latin
+fr | French | Latin
+fy | Western Frisian | Latin
+ga | Irish | Latin
+gd | Scottish Gaelic | Latin
+gl | Galician | Latin
+gu | Gujarati | Gujarati
+ha | Hausa | Latin
+haw | Hawaiian | Latin
+hi | Hindi | Devanagari
+hi-Latn | Hindi | Latin
+hmn | Hmong | Latin
+hr | Croatian | Latin
+ht | Haitian Creole | Latin
+hu | Hungarian | Latin
+hy | Armenian | Armenian
+id | Indonesian | Latin
+ig | Igbo | Latin
+is | Icelandic | Latin
+it | Italian | Latin
+iw | Hebrew | Hebrew
+ja | Japanese | Japanese
+ja-Latn | Japanese | Latin
+jv | Javanese | Latin
+ka | Georgian | Georgian
+kk | Kazakh | Cyrillic
+km | Khmer | Khmer
+kn | Kannada | Kannada
+ko | Korean | Korean
+ku | Kurdish | Latin
+ky | Kyrgyz | Cyrillic
+la | Latin | Latin
+lb | Luxembourgish | Latin
+lo | Lao | Lao
+lt | Lithuanian | Latin
+lv | Latvian | Latin
+mg | Malagasy | Latin
+mi | Maori | Latin
+mk | Macedonian | Cyrillic
+ml | Malayalam | Malayalam
+mn | Mongolian | Cyrillic
+mr | Marathi | Devanagari
+ms | Malay | Latin
+mt | Maltese | Latin
+my | Burmese | Myanmar
+ne | Nepali | Devanagari
+nl | Dutch | Latin
+no | Norwegian | Latin
+ny | Nyanja | Latin
+pa | Punjabi | Gurmukhi
+pl | Polish | Latin
+ps | Pashto | Arabic
+pt | Portuguese | Latin
+ro | Romanian | Latin
+ru | Russian | Cyrillic
+ru-Latn | Russian | Latin
+sd | Sindhi | Arabic
+si | Sinhala | Sinhala
+sk | Slovak | Latin
+sl | Slovenian | Latin
+sm | Samoan | Latin
+sn | Shona | Latin
+so | Somali | Latin
+sq | Albanian | Latin
+sr | Serbian | Cyrillic
+st | Southern Sotho | Latin
+su | Sundanese | Latin
+sv | Swedish | Latin
+sw | Swahili | Latin
+ta | Tamil | Tamil
+te | Telugu | Telugu
+tg | Tajik | Cyrillic
+th | Thai | Thai
+tr | Turkish | Latin
+uk | Ukrainian | Cyrillic
+ur | Urdu | Arabic
+uz | Uzbek | Latin
+vi | Vietnamese | Latin
+xh | Xhosa | Latin
+yi | Yiddish | Hebrew
+yo | Yoruba | Latin
+zh | Chinese | Han (including Simplified and Traditional)
+zh-Latn | Chinese | Latin
+zu | Zulu | Latin
+
### Installation
CLD3 is designed to run in the Chrome browser, so it relies on code in
[Chromium](http://www.chromium.org/).
diff --git a/chromium/third_party/cld_3/src/gcld3/__init__.py b/chromium/third_party/cld_3/src/gcld3/__init__.py
new file mode 100644
index 00000000000..b7d4dcc3f75
--- /dev/null
+++ b/chromium/third_party/cld_3/src/gcld3/__init__.py
@@ -0,0 +1 @@
+from .pybind_ext import *
diff --git a/chromium/third_party/cld_3/src/gcld3/pybind_ext.cc b/chromium/third_party/cld_3/src/gcld3/pybind_ext.cc
new file mode 100644
index 00000000000..024d6d56d89
--- /dev/null
+++ b/chromium/third_party/cld_3/src/gcld3/pybind_ext.cc
@@ -0,0 +1,43 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+#include <pybind11/stl.h>
+
+#include "../src/nnet_language_identifier.h"
+
+// pybind11 bindings that expose chrome_lang_id::NNetLanguageIdentifier and
+// its nested Result type to Python as the `pybind_ext` module.
+namespace pybind11 {
+
+using chrome_lang_id::NNetLanguageIdentifier;
+
+// This is conventional.
+namespace py = pybind11;
+
+PYBIND11_MODULE(pybind_ext, py_module) {
+  using Result = NNetLanguageIdentifier::Result;
+
+  // Identifier class: constructor, the two lookup methods, then the
+  // class-level constants exposed as read-only static attributes.
+  py::class_<NNetLanguageIdentifier> identifier(py_module,
+                                                "NNetLanguageIdentifier");
+  identifier.def(py::init<const int, const int>(), py::arg("min_num_bytes"),
+                 py::arg("max_num_bytes"));
+  identifier.def("FindLanguage", &NNetLanguageIdentifier::FindLanguage,
+                 py::arg("text"));
+  identifier.def("FindTopNMostFreqLangs",
+                 &NNetLanguageIdentifier::FindTopNMostFreqLangs,
+                 py::arg("text"), py::arg("num_langs"));
+  identifier.def_readonly_static("kUnknown",
+                                 &NNetLanguageIdentifier::kUnknown);
+  identifier.def_readonly_static(
+      "kMinNumBytesToConsider",
+      &NNetLanguageIdentifier::kMinNumBytesToConsider);
+  identifier.def_readonly_static(
+      "kMaxNumBytesToConsider",
+      &NNetLanguageIdentifier::kMaxNumBytesToConsider);
+  identifier.def_readonly_static(
+      "kMaxNumInputBytesToConsider",
+      &NNetLanguageIdentifier::kMaxNumInputBytesToConsider);
+  identifier.def_readonly_static(
+      "kReliabilityThreshold",
+      &NNetLanguageIdentifier::kReliabilityThreshold);
+  identifier.def_readonly_static(
+      "kReliabilityHrBsThreshold",
+      &NNetLanguageIdentifier::kReliabilityHrBsThreshold);
+
+  // Result type: every field is exposed as a read/write attribute.
+  py::class_<Result> result(py_module, "Result");
+  result.def_readwrite("language", &Result::language);
+  result.def_readwrite("probability", &Result::probability);
+  result.def_readwrite("is_reliable", &Result::is_reliable);
+  result.def_readwrite("proportion", &Result::proportion);
+}
+
+}  // namespace pybind11
diff --git a/chromium/third_party/cld_3/src/requirements.txt b/chromium/third_party/cld_3/src/requirements.txt
new file mode 100644
index 00000000000..8d0ba11fa12
--- /dev/null
+++ b/chromium/third_party/cld_3/src/requirements.txt
@@ -0,0 +1,3 @@
+protobuf >=3.0.0
+pybind11 >=2.5.0
+wheel >= 0.34.2
diff --git a/chromium/third_party/cld_3/src/setup.py b/chromium/third_party/cld_3/src/setup.py
new file mode 100644
index 00000000000..385189fc99e
--- /dev/null
+++ b/chromium/third_party/cld_3/src/setup.py
@@ -0,0 +1,120 @@
+"""Setup utility for gcld3."""
+
+import os
+import platform
+import shutil
+import subprocess
+import setuptools
+from setuptools.command import build_ext
+
+# Package version and distribution name handed to setuptools.setup() below.
+__version__ = '3.0.13'
+_NAME = 'gcld3'
+
+# Build-time requirements, passed to setup() as `setup_requires`.
+REQUIREMENTS = ['pybind11 >= 2.5.0', 'wheel >= 0.34.2']
+
+# Protocol buffer definitions that CompileProtos feeds to `protoc`.
+PROTO_FILES = [
+ 'src/feature_extractor.proto',
+ 'src/sentence.proto',
+ 'src/task_spec.proto',
+]
+
+# C++ translation units compiled into the `gcld3.pybind_ext` extension.
+SRCS = [
+ 'src/base.cc',
+ 'src/embedding_feature_extractor.cc',
+ 'src/embedding_network.cc',
+ 'src/feature_extractor.cc',
+ 'src/feature_types.cc',
+ 'src/fml_parser.cc',
+ 'src/lang_id_nn_params.cc',
+ 'src/language_identifier_features.cc',
+ 'src/language_identifier_main.cc',
+ 'src/nnet_language_identifier.cc',
+ 'src/registry.cc',
+ 'src/relevant_script_feature.cc',
+ 'src/sentence_features.cc',
+ 'src/task_context.cc',
+ 'src/task_context_params.cc',
+ 'src/unicodetext.cc',
+ 'src/utils.cc',
+ 'src/workspace.cc',
+ 'src/script_span/fixunicodevalue.cc',
+ 'src/script_span/generated_entities.cc',
+ 'src/script_span/generated_ulscript.cc',
+ 'src/script_span/getonescriptspan.cc',
+ 'src/script_span/offsetmap.cc',
+ 'src/script_span/text_processing.cc',
+ 'src/script_span/utf8statetable.cc',
+ # These CC files have to be generated by the proto buffer compiler 'protoc'
+ 'src/cld_3/protos/feature_extractor.pb.cc',
+ 'src/cld_3/protos/sentence.pb.cc',
+ 'src/cld_3/protos/task_spec.pb.cc',
+ # pybind11 bindings
+ 'gcld3/pybind_ext.cc',
+]
+
+
+class CompileProtos(build_ext.build_ext):
+  """`build_ext` command that generates the protobuf C++ sources first."""
+
+  def run(self):
+    """Runs `protoc` over PROTO_FILES, then performs the regular build.
+
+    Raises:
+      RuntimeError: if no `protoc` executable is found.
+      subprocess.CalledProcessError: if `protoc` exits with an error.
+    """
+    protoc_missing = shutil.which('protoc') is None
+    if protoc_missing:
+      raise RuntimeError('Please install the proto buffer compiler.')
+
+    # The C++ code expects the compiled protos under this directory, so
+    # create it when it does not exist yet.
+    compiled_protos_dir = 'src/cld_3/protos/'
+    os.makedirs(compiled_protos_dir, exist_ok=True)
+
+    command = [
+        'protoc',
+        f'--cpp_out={compiled_protos_dir}',
+        '--proto_path=src',
+        *PROTO_FILES,
+    ]
+    subprocess.run(command, check=True, cwd='./')
+
+    super().run()
+
+
+class PyBindIncludes:
+  """Lazy placeholder for the pybind11 include directory.
+
+  An instance stands in for the path inside `include_dirs`. The path is only
+  looked up when the object is converted to a string, so pybind11 can be
+  installed first and imported later to determine the correct include path.
+  """
+
+  def __str__(self):
+    # Imported on demand rather than at module level; see the class docstring.
+    from pybind11 import get_include  # pylint: disable=g-import-not-at-top
+    return get_include()
+
+
+MACOS = platform.system() == 'Darwin'
+
+# Extra compiler and linker flags are only passed when building on macOS.
+if MACOS:
+  _EXTRA_COMPILE_ARGS = ['-std=c++11', '-stdlib=libc++']
+  _EXTRA_LINK_ARGS = ['-stdlib=libc++']
+else:
+  _EXTRA_COMPILE_ARGS = []
+  _EXTRA_LINK_ARGS = []
+
+# The single C++ extension: the CLD3 sources plus the pybind11 bindings,
+# linked against libprotobuf.
+_PYBIND_EXTENSION = setuptools.Extension(
+    'gcld3.pybind_ext',
+    sorted(SRCS),
+    include_dirs=[PyBindIncludes()],
+    libraries=['protobuf'],
+    extra_compile_args=_EXTRA_COMPILE_ARGS,
+    extra_link_args=_EXTRA_LINK_ARGS,
+    language='c++',
+)
+ext_modules = [_PYBIND_EXTENSION]
+
+# Long description shown on the package index page.
+DESCRIPTION = """CLD3 is a neural network model for language identification.
+This package contains the inference code and a trained model. See
+https://github.com/google/cld3 for more details.
+"""
+
+# Package metadata and build configuration. `build_ext` is overridden so the
+# protobuf sources are generated before the extension is compiled.
+setuptools.setup(
+    name=_NAME,
+    version=__version__,
+    description='CLD3 is a neural network model for language identification.',
+    long_description=DESCRIPTION,
+    author='Rami Al-Rfou',
+    author_email='rmyeid@google.com',
+    url='https://github.com/google/cld3',
+    packages=setuptools.find_packages(),
+    ext_modules=ext_modules,
+    cmdclass={'build_ext': CompileProtos},
+    setup_requires=REQUIREMENTS,
+    zip_safe=False,
+)
diff --git a/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h b/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h
index 33a71302bbc..004f903ea87 100644
--- a/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h
+++ b/chromium/third_party/cld_3/src/src/script_span/getonescriptspan.h
@@ -33,14 +33,14 @@ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
static const int kWithinScriptTail = 32; // Stop at word space in last
// N bytes of script buffer
-typedef struct {
+struct LangSpan {
char* text = nullptr; // Pointer to the span, somewhere
int text_bytes = 0; // Number of bytes of text in the span
int offset = 0; // Offset of start of span in original input buffer
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
bool truncated = false; // true if buffer filled up before a
// different script or EOF was found
-} LangSpan;
+};
static inline bool IsContinuationByte(char c) {
return static_cast<signed char>(c) < -64;
diff --git a/chromium/third_party/cld_3/src/src/sentence_features.cc b/chromium/third_party/cld_3/src/src/sentence_features.cc
index cae6e368c87..70d64f40ccf 100644
--- a/chromium/third_party/cld_3/src/src/sentence_features.cc
+++ b/chromium/third_party/cld_3/src/src/sentence_features.cc
@@ -19,11 +19,11 @@ limitations under the License.
namespace chrome_lang_id {
-// Declare registry for the whole Sentence feature functions. NOTE: this is not
+// Define registry for the whole Sentence feature functions. NOTE: this is not
// yet set to anything meaningful. It will be set so in NNetLanguageIdentifier
// constructor, *before* we use any feature.
template <>
-WholeSentenceFeature::Registry
- *RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
+WholeSentenceFeature::Registry*
+ RegisterableClass<WholeSentenceFeature>::registry_ = nullptr;
-} // namespace chrome_lang_id
+} // namespace chrome_lang_id
\ No newline at end of file
diff --git a/chromium/third_party/cld_3/src/src/sentence_features.h b/chromium/third_party/cld_3/src/src/sentence_features.h
index 165bfd5f1a3..cc0be883305 100644
--- a/chromium/third_party/cld_3/src/src/sentence_features.h
+++ b/chromium/third_party/cld_3/src/src/sentence_features.h
@@ -26,9 +26,19 @@ limitations under the License.
namespace chrome_lang_id {
// Feature function that extracts features for the full Sentence.
-typedef FeatureFunction<Sentence> WholeSentenceFeature;
-
-typedef FeatureExtractor<Sentence> WholeSentenceExtractor;
+using WholeSentenceFeature = FeatureFunction<Sentence>;
+
+using WholeSentenceExtractor = FeatureExtractor<Sentence>;
+
+// Declare registry for the whole Sentence feature functions. This is required
+// for clang's -Wundefined-var-template. However, MSVC has a bug which treats
+// this declaration as a definition, leading to multiple definition errors, so
+// omit this on MSVC.
+#if !defined(COMPILER_MSVC)
+template <>
+WholeSentenceFeature::Registry
+ *RegisterableClass<WholeSentenceFeature>::registry_;
+#endif
} // namespace chrome_lang_id