From 0bc443e3b442b49cb6989282601d477c673c4412 Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorickpeterse@gmail.com>
Date: Mon, 12 Sep 2016 17:25:35 +0200
Subject: Handle encoding in non-binary Blob instances

gitlab_git 10.6.4 relies on Rugged marking blobs as binary or not,
instead of relying on Linguist. Linguist in turn would mark text blobs
as binary whenever they would contain byte sequences that could not be
encoded using UTF-8.

However, marking such blobs as binary is not correct. If one pushes a
Markdown document with invalid character sequences it's still a
text-based Markdown document and not some random binary blob.

This commit overrides Blob#data so it automatically converts text-based
content to UTF-8 (the encoding we use everywhere else) while taking care
of replacing any invalid sequences with the UTF-8 replacement character.
The data of binary blobs is left as-is.
---
 spec/models/blob_spec.rb | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'spec/models/blob_spec.rb')

diff --git a/spec/models/blob_spec.rb b/spec/models/blob_spec.rb
index cee20234e1f..03d02b4d382 100644
--- a/spec/models/blob_spec.rb
+++ b/spec/models/blob_spec.rb
@@ -1,3 +1,4 @@
+# encoding: utf-8
 require 'rails_helper'
 
 describe Blob do
@@ -7,6 +8,25 @@ describe Blob do
     end
   end
 
+  describe '#data' do
+    context 'using a binary blob' do
+      it 'returns the data as-is' do
+        data = "\n\xFF\xB9\xC3"
+        blob = described_class.new(double(binary?: true, data: data))
+
+        expect(blob.data).to eq(data)
+      end
+    end
+
+    context 'using a text blob' do
+      it 'converts the data to UTF-8' do
+        blob = described_class.new(double(binary?: false, data: "\n\xFF\xB9\xC3"))
+
+        expect(blob.data).to eq("\n���")
+      end
+    end
+  end
+
   describe '#svg?' do
     it 'is falsey when not text' do
       git_blob = double(text?: false)
-- 
cgit v1.2.1