From 399056ed783e12337a9c47b06b4aae021198f1cd Mon Sep 17 00:00:00 2001
From: Zeger-Jan van de Weg <git@zjvandeweg.nl>
Date: Fri, 3 Aug 2018 15:24:26 +0200
Subject: Remove dependencies on Linguist

This saves about 128 MB of baseline RAM usage per Unicorn and
Sidekiq process (!).

CE/EE has not used Linguist for language detection since
9ae8b57467ac8b38f1fa9020a466d94a93cbb9dd. However, Linguist::BlobHelper
was still being depended on by BlobLike and others.

This removes the Linguist gem, given it isn't required anymore.
EscapeUtils was pulled in as a dependency, but given that Banzai depends
on it, it is now added explicitly.

Previously, Linguist was used to detect the best ACE mode. Instead,
we rely on ACE to guess the best mode based on the file extension.
---
 lib/gitlab/blob_helper.rb      | 145 +++++++++++++++++++++++++++++++++++++++++
 lib/gitlab/conflict/file.rb    |   1 -
 lib/gitlab/git/blob.rb         |   8 +--
 lib/gitlab/git/blob_snippet.rb |  34 ----------
 lib/gitlab/language_data.rb    |  33 ++++++++++
 5 files changed, 182 insertions(+), 39 deletions(-)
 create mode 100644 lib/gitlab/blob_helper.rb
 delete mode 100644 lib/gitlab/git/blob_snippet.rb
 create mode 100644 lib/gitlab/language_data.rb

(limited to 'lib')

diff --git a/lib/gitlab/blob_helper.rb b/lib/gitlab/blob_helper.rb
new file mode 100644
index 00000000000..9b3b383b0c8
--- /dev/null
+++ b/lib/gitlab/blob_helper.rb
@@ -0,0 +1,145 @@
+# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
+module Gitlab
+  module BlobHelper
+    def extname
+      File.extname(name.to_s)
+    end
+
+    def known_extension?
+      LanguageData.extensions.include?(extname.downcase) # the extension set is stored downcased
+    end
+
+    def viewable?
+      !large? && text?
+    end
+
+    MEGABYTE = 1024 * 1024
+
+    def large?
+      size.to_i > MEGABYTE
+    end
+
+    def binary?
+      # No data available (large blobs aren't even loaded into memory): binary
+      if data.nil?
+        true
+
+      # Treat blank files as text
+      elsif data == ""
+        false
+
+      # CharlockHolmes could not determine an encoding: treat as binary
+      elsif encoding.nil?
+        true
+
+      # Otherwise the blob is binary only if CharlockHolmes says it's binary
+      else
+        detect_encoding[:type] == :binary
+      end
+    end
+
+    def text?
+      !binary?
+    end
+
+    def image?
+      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
+    end
+
+    # Internal: Look up the MIME type for the blob's file extension (memoized).
+    #
+    # Returns a MIME::Type, or nil when there are no guesses for the extension.
+    # rubocop:disable Gitlab/ModuleWithInstanceVariables
+    def _mime_type
+      if defined? @_mime_type
+        @_mime_type
+      else
+        guesses = ::MIME::Types.type_for(extname.to_s)
+
+        # Prefer text (ascii) mime types over binary, else take the first guess
+        @_mime_type = guesses.detect { |type| type.ascii? } || guesses.first
+      end
+    end
+    # rubocop:enable Gitlab/ModuleWithInstanceVariables
+
+    # Public: Get the blob's MIME type, as guessed from its file extension.
+    #
+    # Examples
+    #
+    #   # => 'text/plain'
+    #   # => 'text/html'
+    #
+    # Returns a mime type String; 'text/plain' when no type is known.
+    def mime_type
+      _mime_type ? _mime_type.to_s : 'text/plain'
+    end
+
+    def binary_mime_type?
+      _mime_type ? _mime_type.binary? : false
+    end
+
+    def lines
+      @lines ||=
+        if viewable? && data
+          # `data` is usually encoded as ASCII-8BIT even when the content has
+          # been detected as a different encoding. However, we are not allowed
+          # to change the encoding of `data` because we've made the implicit
+          # guarantee that each entry in `lines` is encoded the same way as
+          # `data`.
+          #
+          # Instead, we re-encode each possible newline sequence as the
+          # detected encoding, then force them back to the encoding of `data`
+          # (usually a binary encoding like ASCII-8BIT). This means that the
+          # byte sequence will match how newlines are likely encoded in the
+          # file, but we don't have to change the encoding of `data` as far as
+          # Ruby is concerned. This allows us to correctly parse out each line
+          # without changing the encoding of `data`, and
+          # also--importantly--without having to duplicate many (potentially
+          # large) strings.
+          begin
+            data.split(encoded_newlines_re, -1)
+          rescue Encoding::ConverterNotFoundError
+            # The data is not splittable in the detected encoding.  Assume it's
+            # one big line.
+            [data]
+          end
+        else
+          []
+        end
+    end
+
+    def content_type
+      # rubocop:disable Style/MultilineTernaryOperator
+      # rubocop:disable Style/NestedTernaryOperator
+      @content_type ||= binary_mime_type? || binary? ? mime_type :
+                          (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
+      # rubocop:enable Style/NestedTernaryOperator
+      # rubocop:enable Style/MultilineTernaryOperator
+    end
+
+    def encoded_newlines_re
+      @encoded_newlines_re ||=
+        Regexp.union(["\r\n", "\r", "\n"].map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
+    end
+
+    def ruby_encoding
+      if hash = detect_encoding
+        hash[:ruby_encoding]
+      end
+    end
+
+    def encoding
+      if hash = detect_encoding
+        hash[:encoding]
+      end
+    end
+
+    def detect_encoding
+      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
+    end
+
+    def empty?
+      data.nil? || data == ""
+    end
+  end
+end
diff --git a/lib/gitlab/conflict/file.rb b/lib/gitlab/conflict/file.rb
index 2a0cb640a14..30911b49b18 100644
--- a/lib/gitlab/conflict/file.rb
+++ b/lib/gitlab/conflict/file.rb
@@ -158,7 +158,6 @@ module Gitlab
         json_hash.tap do |json_hash|
           if opts[:full_content]
             json_hash[:content] = content
-            json_hash[:blob_ace_mode] = our_blob && our_blob.language.try(:ace_mode)
           else
             json_hash[:sections] = sections if type.text?
             json_hash[:type] = type
diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb
index 71857bd2d87..13b0bb930f4 100644
--- a/lib/gitlab/git/blob.rb
+++ b/lib/gitlab/git/blob.rb
@@ -3,13 +3,13 @@
 module Gitlab
   module Git
     class Blob
-      include Linguist::BlobHelper
+      include Gitlab::BlobHelper
       include Gitlab::EncodingHelper
 
       # This number is the maximum amount of data that we want to display to
-      # the user. We load as much as we can for encoding detection
-      # (Linguist) and LFS pointer parsing. All other cases where we need full
-      # blob data should use load_all_data!.
+      # the user. We load as much as we can for encoding detection and LFS
+      # pointer parsing. All other cases where we need full blob data should
+      # use load_all_data!.
       MAX_DATA_DISPLAY_SIZE = 10.megabytes
 
       # These limits are used as a heuristic to ignore files which can't be LFS
diff --git a/lib/gitlab/git/blob_snippet.rb b/lib/gitlab/git/blob_snippet.rb
deleted file mode 100644
index 68116e775c6..00000000000
--- a/lib/gitlab/git/blob_snippet.rb
+++ /dev/null
@@ -1,34 +0,0 @@
-# Gitaly note: JV: no RPC's here.
-
-module Gitlab
-  module Git
-    class BlobSnippet
-      include Linguist::BlobHelper
-
-      attr_accessor :ref
-      attr_accessor :lines
-      attr_accessor :filename
-      attr_accessor :startline
-
-      def initialize(ref, lines, startline, filename)
-        @ref, @lines, @startline, @filename = ref, lines, startline, filename
-      end
-
-      def data
-        lines&.join("\n")
-      end
-
-      def name
-        filename
-      end
-
-      def size
-        data.length
-      end
-
-      def mode
-        nil
-      end
-    end
-  end
-end
diff --git a/lib/gitlab/language_data.rb b/lib/gitlab/language_data.rb
new file mode 100644
index 00000000000..bfdd7175198
--- /dev/null
+++ b/lib/gitlab/language_data.rb
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module LanguageData
+    EXTENSION_MUTEX = Mutex.new
+
+    class << self
+      include Gitlab::Utils::StrongMemoize
+
+      def extensions
+        EXTENSION_MUTEX.synchronize do
+          strong_memoize(:extensions) do
+            Set.new.tap do |set|
+              YAML.load_file(Rails.root.join('vendor', 'languages.yml')).each do |_name, details|
+                details['extensions']&.each do |ext|
+                  next unless ext.start_with?('.')
+
+                  set << ext.downcase
+                end
+              end
+            end
+          end
+        end
+      end
+
+      def clear_extensions!
+        EXTENSION_MUTEX.synchronize do
+          clear_memoization(:extensions)
+        end
+      end
+    end
+  end
+end
-- 
cgit v1.2.1