summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatias Larsson <matias.larsson@matthews.se>2022-11-24 18:22:55 +0100
committerPeng Wu <alexepico@gmail.com>2022-11-29 16:16:21 +0800
commit8bcea4710bd328ef74dc852d9fdffb3c47ed8abe (patch)
tree88a92418b2408d31aeb5ae21d04a5ae1c4ce4a8d
parent521340f02907dae96f6f13dbf7997a34ebe864f2 (diff)
downloadlibpinyin-8bcea4710bd328ef74dc852d9fdffb3c47ed8abe.tar.gz
Fix libpinyin crash on ARMv7
Fix the alignment trap in get_unigram_frequency(). Fix also other places where this same trap could happen (depending on compiler and surrounding code). The trap happened when the ARM GCC generated a SIMD instruction (specifically VLDR) to load 32 bits in a single instruction, and when the memory address was not aligned to 32 bits. VLDR traps if the address is not aligned. GCC generated the instruction because of the cast to uint32 from the address. The fix is to allocate a uint32 variable in stack and use memcpy to copy the data to that variable. This way we ensure that appropriate instructions are generated. **Links** About the issue with GCC: https://trust-in-soft.com/blog/2020/04/06/gcc-always-assumes-aligned-pointer-accesses/ How Linux does it: https://elixir.bootlin.com/linux/v5.10.155/source/include/linux/unaligned/memmove.h#L13 ARM documentation: https://documentation-service.arm.com/static/5f8dc043f86e16515cdbbc92?token= See 'A3.2.1 Unaligned data access'
-rw-r--r--src/include/memory_chunk.h34
-rw-r--r--src/include/unaligned_memory.h43
-rw-r--r--src/storage/chewing_large_table.cpp2
-rw-r--r--src/storage/ngram.cpp6
-rw-r--r--src/storage/phrase_index.cpp23
-rw-r--r--src/storage/phrase_index.h13
-rw-r--r--src/storage/phrase_large_table2.cpp3
7 files changed, 104 insertions, 20 deletions
diff --git a/src/include/memory_chunk.h b/src/include/memory_chunk.h
index c106dba..baa5679 100644
--- a/src/include/memory_chunk.h
+++ b/src/include/memory_chunk.h
@@ -290,6 +290,25 @@ public:
}
/**
+ * MemoryChunk::set_content:
+ * @offset: the offset in this MemoryChunk.
+ * @data: the data to be copied.
+ * @returns: whether the data is copied successfully.
+ *
+ * Data are written directly to the memory area in this MemoryChunk.
+ *
+ */
+ template <typename T>
+ bool set_content(size_t offset, T data){
+ const size_t len = sizeof(data);
+ size_t cursize = std_lite::max(size(), offset + len);
+ ensure_has_space(offset + len);
+ memmove(m_data_begin + offset, &data, len);
+ m_data_end = m_data_begin + cursize;
+ return true;
+ }
+
+ /**
* MemoryChunk::append_content:
* @data: the begin of the data to be copied.
* @len: the length of the data to be copied.
@@ -357,6 +376,21 @@ public:
}
/**
+ * MemoryChunk::get_content:
+ * @offset: the offset in this MemoryChunk.
+ * @returns: the content
+ *
+ * Get the content in this MemoryChunk.
+ *
+ */
+ template <typename T>
+ T get_content(size_t offset) const {
+ T value;
+ memcpy(&value, m_data_begin + offset, sizeof(value));
+ return value;
+ }
+
+ /**
* MemoryChunk::compact_memory:
*
* Compact memory, reduce the size.
diff --git a/src/include/unaligned_memory.h b/src/include/unaligned_memory.h
new file mode 100644
index 0000000..27b2f19
--- /dev/null
+++ b/src/include/unaligned_memory.h
@@ -0,0 +1,43 @@
+#ifndef UNALIGNED_MEMORY_H
+#define UNALIGNED_MEMORY_H
+
+#include <cstring>
+
+/**
+ * UnalignedMemory: Safe unaligned memory access.
+ *
+ * Some instruction sets, or some instructions in some instruction sets
+ * require that memory access is aligned to a specific boundary. These
+ * instructions may trap on unaligned access.
+ *
+ * This class provides methods to load and store values at unaligned
+ * addresses. It ensures that the compiler doesn't generate instructions
+ * that could trap on the unaligned memory access.
+ */
+
+namespace pinyin{
+ template <typename T>
+ class UnalignedMemory{
+ public:
+ /**
+ * Read a value from a possibly unaligned memory address.
+ *
+ */
+ static T load(const void * src) {
+ T value;
+ memcpy(&value, src, sizeof(T));
+ return value;
+ }
+
+ /**
+ * Store a value into a possibly unaligned memory address.
+ *
+ */
+ static void store(T value, void * dest) {
+ memcpy(dest, &value, sizeof(T));
+ }
+ };
+};
+
+
+#endif
diff --git a/src/storage/chewing_large_table.cpp b/src/storage/chewing_large_table.cpp
index e856caf..bd76e9b 100644
--- a/src/storage/chewing_large_table.cpp
+++ b/src/storage/chewing_large_table.cpp
@@ -804,7 +804,7 @@ bool ChewingBitmapIndexLevel::store(MemoryChunk * new_chunk,
bool ChewingLengthIndexLevel::load(MemoryChunk * chunk, table_offset_t offset,
table_offset_t end) {
char * begin = (char *) chunk->begin();
- guint32 nindex = *((guint32 *)(begin + offset)); /* number of index */
+ guint32 nindex = chunk->get_content<guint32>(offset); /* number of index */
table_offset_t * index = (table_offset_t *)
(begin + offset + sizeof(guint32));
diff --git a/src/storage/ngram.cpp b/src/storage/ngram.cpp
index c952768..e4bfe8f 100644
--- a/src/storage/ngram.cpp
+++ b/src/storage/ngram.cpp
@@ -46,14 +46,12 @@ SingleGram::SingleGram(void * buffer, size_t length, bool copy){
}
bool SingleGram::get_total_freq(guint32 & total) const{
- char * buf_begin = (char *)m_chunk.begin();
- total = *((guint32 *)buf_begin);
+ total = m_chunk.get_content<guint32>(0);
return true;
}
bool SingleGram::set_total_freq(guint32 total){
- char * buf_begin = (char *)m_chunk.begin();
- *((guint32 *)buf_begin) = total;
+ m_chunk.set_content<guint32>(0, total);
return true;
}
diff --git a/src/storage/phrase_index.cpp b/src/storage/phrase_index.cpp
index e9e9d5d..de864c2 100644
--- a/src/storage/phrase_index.cpp
+++ b/src/storage/phrase_index.cpp
@@ -20,6 +20,7 @@
#include "phrase_index.h"
#include "pinyin_custom2.h"
+#include "unaligned_memory.h"
namespace pinyin{
@@ -61,10 +62,12 @@ bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
for (int i = 0; i < npron; ++i) {
char * chewing_begin = buf_begin + offset +
i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
- guint32 * freq = (guint32 *)(chewing_begin +
- phrase_length * sizeof(ChewingKey));
+
+ guint32 * pfreq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ guint32 freq = UnalignedMemory<guint32>::load(pfreq);
- total_freq += *freq;
+ total_freq += freq;
if (0 == pinyin_exact_compare2
(keys, (ChewingKey *)chewing_begin, phrase_length)) {
@@ -74,8 +77,9 @@ bool PhraseItem::add_pronunciation(ChewingKey * keys, guint32 delta){
if (delta > 0 && total_freq > total_freq + delta)
return false;
- *freq += delta;
+ freq += delta;
total_freq += delta;
+ UnalignedMemory<guint32>::store(freq, pfreq);
return true;
}
}
@@ -117,9 +121,11 @@ void PhraseItem::increase_pronunciation_possibility(ChewingKey * keys,
for (int i = 0; i < npron; ++i) {
char * chewing_begin = buf_begin + offset +
i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
- guint32 * freq = (guint32 *)(chewing_begin +
- phrase_length * sizeof(ChewingKey));
- total_freq += *freq;
+
+ guint32 * pfreq = (guint32 *)(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ guint32 freq = UnalignedMemory<guint32>::load(pfreq);
+ total_freq += freq;
if (0 == pinyin_compare_with_tones(keys, (ChewingKey *)chewing_begin,
phrase_length)) {
@@ -128,8 +134,9 @@ void PhraseItem::increase_pronunciation_possibility(ChewingKey * keys,
if (delta > 0 && total_freq > total_freq + delta)
return;
- *freq += delta;
+ freq += delta;
total_freq += delta;
+ UnalignedMemory<guint32>::store(freq, pfreq);
}
}
}
diff --git a/src/storage/phrase_index.h b/src/storage/phrase_index.h
index 83dfb51..7e910ed 100644
--- a/src/storage/phrase_index.h
+++ b/src/storage/phrase_index.h
@@ -31,6 +31,7 @@
#include "memory_chunk.h"
#include "phrase_index_logger.h"
#include "table_info.h"
+#include "unaligned_memory.h"
/**
* Phrase Index File Format
@@ -121,8 +122,7 @@ public:
*
*/
guint32 get_unigram_frequency(){
- char * buf_begin = (char *)m_chunk.begin();
- return (*(guint32 *)(buf_begin + sizeof(guint8) + sizeof(guint8)));
+ return m_chunk.get_content<guint32>(sizeof(guint8) + sizeof(guint8));
}
/**
@@ -142,12 +142,13 @@ public:
for ( int i = 0 ; i < npron ; ++i){
char * chewing_begin = buf_begin + offset +
i * (phrase_length * sizeof(ChewingKey) + sizeof(guint32));
- guint32 * freq = (guint32 *)(chewing_begin +
- phrase_length * sizeof(ChewingKey));
- total_freq += *freq;
+
+ guint32 freq = UnalignedMemory<guint32>::load(chewing_begin +
+ phrase_length * sizeof(ChewingKey));
+ total_freq += freq;
if ( 0 == pinyin_compare_with_tones(keys, (ChewingKey *)chewing_begin,
phrase_length) ){
- matched += *freq;
+ matched += freq;
}
}
diff --git a/src/storage/phrase_large_table2.cpp b/src/storage/phrase_large_table2.cpp
index b72917c..28eb313 100644
--- a/src/storage/phrase_large_table2.cpp
+++ b/src/storage/phrase_large_table2.cpp
@@ -561,7 +561,8 @@ bool PhraseLengthIndexLevel2::load(MemoryChunk * chunk,
table_offset_t offset,
table_offset_t end) {
char * buf_begin = (char *) chunk->begin();
- guint32 nindex = *((guint32 *)(buf_begin + offset));
+ guint32 nindex = chunk->get_content<guint32>(offset);
+
table_offset_t * index = (table_offset_t *)
(buf_begin + offset + sizeof(guint32));