Merge branch 'zj-remove-linguist' into 'master'

Remove dependencies on Linguist. Closes #35450. See merge request gitlab-org/gitlab-ce!21008
author: Robert Speicher <robert@gitlab.com> 2018-10-15 15:27:15 +0000
committer: Robert Speicher <robert@gitlab.com> 2018-10-15 15:27:15 +0000
commit: f5d088eb11b066dadad06948e158b39e9432573c (patch)
tree: 9f1db7d7ba83f1b221bdb88b864e8efca1fd95d8 /lib
parent: 201143e9db57ed6f4cd72704387777b4be5ecc34 (diff)
parent: 399056ed783e12337a9c47b06b4aae021198f1cd (diff)
download: gitlab-ce-f5d088eb11b066dadad06948e158b39e9432573c.tar.gz
5 files changed, 182 insertions, 39 deletions
diff --git a/lib/gitlab/blob_helper.rb b/lib/gitlab/blob_helper.rb
new file mode 100644
index 00000000000..9b3b383b0c8
--- /dev/null
+++ b/lib/gitlab/blob_helper.rb
@@ -0,0 +1,145 @@
+# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
+module Gitlab
+  module BlobHelper
+    def extname
+      File.extname(name.to_s)
+    end
+
+    def known_extension?
+      LanguageData.extensions.include?(extname)
+    end
+
+    def viewable?
+      !large? && text?
+    end
+
+    MEGABYTE = 1024 * 1024
+
+    def large?
+      size.to_i > MEGABYTE
+    end
+
+    def binary?
+      # Large blobs aren't even loaded into memory
+      if data.nil?
+        true
+
+      # Treat blank files as text
+      elsif data == ""
+        false
+
+      # CharlockHolmes could not detect an encoding
+      elsif encoding.nil?
+        true
+
+      # Otherwise, binary only if CharlockHolmes says it's binary
+      else
+        detect_encoding[:type] == :binary
+      end
+    end
+
+    def text?
+      !binary?
+    end
+
+    def image?
+      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
+    end
+
+    # Internal: Lookup mime type for extension.
+    #
+    # Returns a MIME::Type
+    # rubocop:disable Gitlab/ModuleWithInstanceVariables
+    def _mime_type
+      if defined? @_mime_type
+        @_mime_type
+      else
+        guesses = ::MIME::Types.type_for(extname.to_s)
+
+        # Prefer text mime types over binary
+        @_mime_type = guesses.detect { |type| type.ascii? } || guesses.first
+      end
+    end
+    # rubocop:enable Gitlab/ModuleWithInstanceVariables
+
+    # Public: Get the actual blob mime type
+    #
+    # Examples
+    #
+    #   # => 'text/plain'
+    #   # => 'text/html'
+    #
+    # Returns a mime type String.
+    def mime_type
+      _mime_type ? _mime_type.to_s : 'text/plain'
+    end
+
+    def binary_mime_type?
+      _mime_type ? _mime_type.binary? : false
+    end
+
+    def lines
+      @lines ||=
+        if viewable? && data
+          # `data` is usually encoded as ASCII-8BIT even when the content has
+          # been detected as a different encoding. However, we are not allowed
+          # to change the encoding of `data` because we've made the implicit
+          # guarantee that each entry in `lines` is encoded the same way as
+          # `data`.
+          #
+          # Instead, we re-encode each possible newline sequence as the
+          # detected encoding, then force them back to the encoding of `data`
+          # (usually a binary encoding like ASCII-8BIT). This means that the
+          # byte sequence will match how newlines are likely encoded in the
+          # file, but we don't have to change the encoding of `data` as far as
+          # Ruby is concerned. This allows us to correctly parse out each line
+          # without changing the encoding of `data`, and
+          # also--importantly--without having to duplicate many (potentially
+          # large) strings.
+          begin
+            data.split(encoded_newlines_re, -1)
+          rescue Encoding::ConverterNotFoundError
+            # The data is not splittable in the detected encoding.  Assume it's
+            # one big line.
+            [data]
+          end
+        else
+          []
+        end
+    end
+
+    def content_type
+      # rubocop:disable Style/MultilineTernaryOperator
+      # rubocop:disable Style/NestedTernaryOperator
+      @content_type ||= binary_mime_type? || binary? ? mime_type :
+                          (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
+      # rubocop:enable Style/NestedTernaryOperator
+      # rubocop:enable Style/MultilineTernaryOperator
+    end
+
+    def encoded_newlines_re
+      @encoded_newlines_re ||=
+        Regexp.union(["\r\n", "\r", "\n"].map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
+    end
+
+    def ruby_encoding
+      if hash = detect_encoding
+        hash[:ruby_encoding]
+      end
+    end
+
+    def encoding
+      if hash = detect_encoding
+        hash[:encoding]
+      end
+    end
+
+    def detect_encoding
+      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
+    end
+
+    def empty?
+      data.nil? || data == ""
+    end
+  end
+end
diff --git a/lib/gitlab/conflict/file.rb b/lib/gitlab/conflict/file.rb
index 2a0cb640a14..30911b49b18 100644
--- a/lib/gitlab/conflict/file.rb
+++ b/lib/gitlab/conflict/file.rb
@@ -158,7 +158,6 @@ module Gitlab
         json_hash.tap do |json_hash|
           if opts[:full_content]
             json_hash[:content] = content
-            json_hash[:blob_ace_mode] = our_blob && our_blob.language.try(:ace_mode)
           else
             json_hash[:sections] = sections if type.text?
             json_hash[:type] = type
diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb
index 71857bd2d87..13b0bb930f4 100644
--- a/lib/gitlab/git/blob.rb
+++ b/lib/gitlab/git/blob.rb
@@ -3,13 +3,13 @@
 module Gitlab
   module Git
     class Blob
-      include Linguist::BlobHelper
+      include Gitlab::BlobHelper
       include Gitlab::EncodingHelper
 
       # This number is the maximum amount of data that we want to display to
-      # the user. We load as much as we can for encoding detection
-      # (Linguist) and LFS pointer parsing. All other cases where we need full
-      # blob data should use load_all_data!.
+      # the user. We load as much as we can for encoding detection and LFS
+      # pointer parsing. All other cases where we need full blob data should
+      # use load_all_data!.
       MAX_DATA_DISPLAY_SIZE = 10.megabytes
 
       # These limits are used as a heuristic to ignore files which can't be LFS
diff --git a/lib/gitlab/git/blob_snippet.rb b/lib/gitlab/git/blob_snippet.rb
deleted file mode 100644
index 68116e775c6..00000000000
--- a/lib/gitlab/git/blob_snippet.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-# Gitaly note: JV: no RPC's here.
-
-module Gitlab
-  module Git
-    class BlobSnippet
-      include Linguist::BlobHelper
-
-      attr_accessor :ref
-      attr_accessor :lines
-      attr_accessor :filename
-      attr_accessor :startline
-
-      def initialize(ref, lines, startline, filename)
-        @ref, @lines, @startline, @filename = ref, lines, startline, filename
-      end
-
-      def data
-        lines&.join("\n")
-      end
-
-      def name
-        filename
-      end
-
-      def size
-        data.length
-      end
-
-      def mode
-        nil
-      end
-    end
-  end
-end
diff --git a/lib/gitlab/language_data.rb b/lib/gitlab/language_data.rb
new file mode 100644
index 00000000000..bfdd7175198
--- /dev/null
+++ b/lib/gitlab/language_data.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module LanguageData
+    EXTENSION_MUTEX = Mutex.new
+
+    class << self
+      include Gitlab::Utils::StrongMemoize
+
+      def extensions
+        EXTENSION_MUTEX.synchronize do
+          strong_memoize(:extensions) do
+            Set.new.tap do |set|
+              YAML.load_file(Rails.root.join('vendor', 'languages.yml')).each do |_name, details|
+                details['extensions']&.each do |ext|
+                  next unless ext.start_with?('.')
+
+                  set << ext.downcase
+                end
+              end
+            end
+          end
+        end
+      end
+
+      def clear_extensions!
+        EXTENSION_MUTEX.synchronize do
+          clear_memoization(:extensions)
+        end
+      end
+    end
+  end
+end
author	Robert Speicher <robert@gitlab.com>	2018-10-15 15:27:15 +0000
committer	Robert Speicher <robert@gitlab.com>	2018-10-15 15:27:15 +0000
commit	f5d088eb11b066dadad06948e158b39e9432573c (patch)
tree	9f1db7d7ba83f1b221bdb88b864e8efca1fd95d8 /lib
parent	201143e9db57ed6f4cd72704387777b4be5ecc34 (diff)
parent	399056ed783e12337a9c47b06b4aae021198f1cd (diff)
download	gitlab-ce-f5d088eb11b066dadad06948e158b39e9432573c.tar.gz