summaryrefslogtreecommitdiff
path: root/chromium/third_party/cld_3/src/src/relevant_script_feature.h
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/cld_3/src/src/relevant_script_feature.h')
-rw-r--r--chromium/third_party/cld_3/src/src/relevant_script_feature.h49
1 files changed, 49 insertions, 0 deletions
diff --git a/chromium/third_party/cld_3/src/src/relevant_script_feature.h b/chromium/third_party/cld_3/src/src/relevant_script_feature.h
new file mode 100644
index 00000000000..ce808105769
--- /dev/null
+++ b/chromium/third_party/cld_3/src/src/relevant_script_feature.h
@@ -0,0 +1,49 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef RELEVANT_SCRIPT_FEATURE_H_
+#define RELEVANT_SCRIPT_FEATURE_H_
+
+#include "feature_extractor.h"
+#include "cld_3/protos/sentence.pb.h"
+#include "sentence_features.h"
+#include "task_context.h"
+#include "workspace.h"
+
+namespace chrome_lang_id {
+
+// Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode
+// script (see below): each such feature indicates the script and the ratio of
+// UTF8 characters in that script, in the given sentence.
+//
+// What is a relevant script? Recognizing all 100+ Unicode scripts would
+// require too much code size and runtime. Instead, we focus only on a few
+// scripts that communicate a lot of language information: e.g., the use of
+// Hiragana characters almost always indicates Japanese, so Hiragana is a
+// "relevant" script for us. The Latin script is used by dozens of language, so
+// Latin is not relevant in this context.
+class RelevantScriptFeature : public WholeSentenceFeature {
+ public:
+ void Setup(TaskContext *context) override;
+ void Init(TaskContext *context) override;
+
+ // Appends the features computed from the sentence to the feature vector.
+ void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
+ FeatureVector *result) const override;
+};
+
+} // namespace chrome_lang_id
+
+#endif // RELEVANT_SCRIPT_FEATURE_H_