Commit 4443090

mo khan <mo.khan@gmail.com>
2020-01-26 02:30:57
Start to test out string similarity
1 parent 5ff59cc
lib/spandx/catalogue.rb
@@ -45,7 +45,7 @@ module Spandx
     end
 
     def map_from(license_hash)
-      License.new(license_hash)
+      License.new(license_hash, self)
     end
 
     def present?(item)
lib/spandx/content.rb
@@ -3,22 +3,282 @@
 module Spandx
   # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
   class Content
-    attr_reader :bigrams
+    START_REGEX = /\A\s*/.freeze
+    END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i.freeze
+    REGEXES = {
+      hrs: /^\s*[=\-\*]{3,}\s*$/,
+      all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
+      whitespace: /\s+/,
+      markdown_headings: /#{START_REGEX}#+/,
+      version: /#{START_REGEX}version.*$/i,
+      span_markup: /[_*~]+(.*?)[_*~]+/,
+      link_markup: /\[(.+?)\]\(.+?\)/,
+      block_markup: /^\s*>/,
+      border_markup: /^[\*-](.*?)[\*-]$/,
+      comment_markup: %r{^\s*?[/\*]{1,2}},
+      url: %r{#{START_REGEX}https?://[^ ]+\n},
+      bullet: /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i,
+      developed_by: /#{START_REGEX}developed by:.*?\n\n/im,
+      quote_begin: /[`'"‘“]/,
+      quote_end: /[`'"’”]/,
+      cc_legal_code: /^\s*Creative Commons Legal Code\s*$/i,
+      cc0_info: /For more information, please see\s*\S+zero\S+/im,
+      cc0_disclaimer: /CREATIVE COMMONS CORPORATION.*?\n\n/im,
+      unlicense_info: /For more information, please.*\S+unlicense\S+/im,
+      mit_optional: /\(including the next paragraph\)/i
+    }.freeze
+    COPYRIGHT_SYMBOLS = Regexp.union([/copyright/i, /\(c\)/i, "\u00A9", "\xC2\xA9"])
+    COPYRIGHT_REGEX = /#{START_REGEX}(?:portions )?(\s*#{COPYRIGHT_SYMBOLS}.*$)+$/i.freeze
+    NORMALIZATIONS = {
+      lists: { from: /^\s*(?:\d\.|\*)\s+([^\n])/, to: '- \1' },
+      https: { from: /http:/, to: 'https:' },
+      ampersands: { from: '&', to: 'and' },
+      dashes: { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
+      quotes: {
+        from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/,
+        to: '"\1"'
+      }
+    }.freeze
+    VARIETAL_WORDS = {
+      'acknowledgment' => 'acknowledgement',
+      'analogue' => 'analog',
+      'analyse' => 'analyze',
+      'artefact' => 'artifact',
+      'authorisation' => 'authorization',
+      'authorised' => 'authorized',
+      'calibre' => 'caliber',
+      'cancelled' => 'canceled',
+      'capitalisations' => 'capitalizations',
+      'catalogue' => 'catalog',
+      'categorise' => 'categorize',
+      'centre' => 'center',
+      'emphasised' => 'emphasized',
+      'favour' => 'favor',
+      'favourite' => 'favorite',
+      'fulfil' => 'fulfill',
+      'fulfilment' => 'fulfillment',
+      'initialise' => 'initialize',
+      'judgment' => 'judgement',
+      'labelling' => 'labeling',
+      'labour' => 'labor',
+      'licence' => 'license',
+      'maximise' => 'maximize',
+      'modelled' => 'modeled',
+      'modelling' => 'modeling',
+      'offence' => 'offense',
+      'optimise' => 'optimize',
+      'organisation' => 'organization',
+      'organise' => 'organize',
+      'practise' => 'practice',
+      'programme' => 'program',
+      'realise' => 'realize',
+      'recognise' => 'recognize',
+      'signalling' => 'signaling',
+      'sub-license' => 'sublicense',
+      'sub license' => 'sublicense',
+      'utilisation' => 'utilization',
+      'whilst' => 'while',
+      'wilful' => 'wilfull',
+      'non-commercial' => 'noncommercial',
+      'cent' => 'percent',
+      'owner' => 'holder'
+    }.freeze
+    STRIP_METHODS = %i[
+      cc0_optional
+      unlicense_optional
+      hrs
+      markdown_headings
+      borders
+      title
+      version
+      url
+      copyright
+      title
+      block_markup
+      span_markup
+      link_markup
+      all_rights_reserved
+      developed_by
+      end_of_terms
+      whitespace
+      mit_optional
+    ].freeze
 
-    def initialize(a)
-      @bigrams = to_bigrams(a)
+    attr_reader :tokens, :catalogue, :content
+
+    def initialize(content, catalogue)
+      @content = content
+      @catalogue = catalogue
+      @tokens = tokenize(content)
+    end
+
+    def similar?(other)
+      overlap = (wordset & other.wordset).size
+      total = wordset.size + other.wordset.size
+      100.0 * (overlap * 2.0 / total)
     end
 
     def similar?(other)
-      overlap = (bigrams & other.bigrams).size
-      total = bigrams.size + other.bigrams.size
-      overlap * 2.0 / total
+      overlap = (tokens & other.tokens).size
+      total = tokens.size + other.tokens.size
+      100.0 * (overlap * 2.0 / total)
+    end
+
+    def wordset
+      @wordset ||= content_normalized&.scan(/(?:\w(?:'s|(?<=s)')?)+/)&.to_set
     end
 
     private
 
-    def to_bigrams(x)
-      x.each_char.each_cons(2).to_a
+    def tokenize(string)
+      #string.downcase.split(/\W+/)
+      string.downcase.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
+    end
+
+    def content_normalized(wrap: nil)
+      @content_normalized ||=
+        begin
+          @_content = content_without_title_and_version.downcase
+          (NORMALIZATIONS.keys + %i[spelling bullets]).each { |x| normalize(x) }
+          STRIP_METHODS.each { |x| strip(x) }
+
+          _content
+        end
+
+      if wrap.nil?
+        @content_normalized
+      else
+        Licensee::ContentHelper.wrap(@content_normalized, wrap)
+      end
+    end
+
+    def content_without_title_and_version
+      @content_without_title_and_version ||=
+        begin
+          @_content = nil
+          [
+            :comments,
+            :hrs,
+            #:html,
+            #:markdown_headings,
+            :title,
+            :version
+          ].each { |x| strip(x) }
+          _content
+        end
+    end
+
+    def _content
+      @_content ||= content.to_s.dup.strip
+    end
+
+    def strip(regex_or_sym)
+      return unless _content
+
+      if regex_or_sym.is_a?(Symbol)
+        meth = "strip_#{regex_or_sym}"
+        return send(meth) if respond_to?(meth, true)
+
+        unless REGEXES[regex_or_sym]
+          raise ArgumentError, "#{regex_or_sym} is an invalid regex reference"
+        end
+
+        regex_or_sym = REGEXES[regex_or_sym]
+      end
+
+      @_content = _content.gsub(regex_or_sym, ' ').squeeze(' ').strip
+    end
+
+    def strip_comments
+      lines = _content.split("\n")
+      return if lines.count == 1
+      return unless lines.all? { |line| line =~ REGEXES[:comment_markup] }
+
+      strip(:comment_markup)
+    end
+
+    def strip_cc0_optional
+      return unless _content.include? 'associating cc0'
+
+      strip(REGEXES[:cc_legal_code])
+      strip(REGEXES[:cc0_info])
+      strip(REGEXES[:cc0_disclaimer])
+    end
+
+    def strip_unlicense_optional
+      return unless _content.include? 'unlicense'
+
+      strip(REGEXES[:unlicense_info])
+    end
+
+    def strip_borders
+      normalize(REGEXES[:border_markup], '\1')
+    end
+
+    def strip_copyright
+      strip(COPYRIGHT_REGEX) while _content =~ COPYRIGHT_REGEX
+    end
+
+    def strip_end_of_terms
+      body, _partition, _instructions = _content.partition(END_OF_TERMS_REGEX)
+      @_content = body
+    end
+
+    def strip_title
+      strip(title_regex) while _content =~ title_regex
+    end
+
+    def title_regex
+      @title_regex ||= begin
+        titles = catalogue.map { |x| title_regex_for(x) }
+        /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
+      end
+    end
+
+    def normalize(from_or_key, to = nil)
+      operation = { from: from_or_key, to: to } if to
+      operation ||= NORMALIZATIONS[from_or_key]
+
+      if operation
+        @_content = _content.gsub operation[:from], operation[:to]
+      elsif respond_to?("normalize_#{from_or_key}", true)
+        send("normalize_#{from_or_key}")
+      else
+        raise ArgumentError, "#{from_or_key} is an invalid normalization"
+      end
+    end
+
+    def normalize_spelling
+      normalize(/\b#{Regexp.union(VARIETAL_WORDS.keys)}\b/, VARIETAL_WORDS)
+    end
+
+    def normalize_bullets
+      normalize(REGEXES[:bullet], "\n\n* ")
+      normalize(/\)\s+\(/, ')(')
+    end
+
+    def title_regex_for(license)
+      string = license.id.downcase.sub('*', 'u')
+      string.sub!(/\Athe /i, '')
+      string.sub!(/,? version /, ' ')
+      string.sub!(/v(\d+\.\d+)/, '\1')
+      string = Regexp.escape(string)
+      string = string.sub(/\\ licen[sc]e/i, '(?:\ licen[sc]e)?')
+      string = string.sub(/\\ (\d+\\.\d+)/, ',?\s+(?:version\ |v(?:\. )?)?\1')
+      string = string.sub(/\bgnu\\ /, '(?:GNU )?')
+      title_regex = Regexp.new string, 'i'
+
+      string = license.id.downcase.sub('-', '[- ]')
+      string.sub!('.', '\.')
+      string << '(?:\ licen[sc]e)?'
+      key_regex = Regexp.new string, 'i'
+
+      parts = [title_regex, key_regex]
+      # if meta.nickname
+      # parts.push Regexp.new meta.nickname.sub(/\bGNU /i, '(?:GNU )?')
+      # end
+
+      Regexp.union(parts)
     end
   end
 end
lib/spandx/guess.rb
@@ -11,7 +11,7 @@ module Spandx
       end
 
       def <=>(other)
-        self.score <=> other.score
+        score <=> other.score
       end
 
       def to_s
@@ -26,9 +26,9 @@ module Spandx
     end
 
     def license_for(content)
-      this = Content.new(content)
+      this = Content.new(content, catalogue)
       catalogue
-        .map { |x| Score.new(this.similar?(Content.new(x.details.text)), x) }
+        .map { |x| Score.new(this.similar?(x.content), x) }
         .max
         .item
         .id
lib/spandx/license.rb
@@ -14,10 +14,11 @@ module Spandx
       end
     end
 
-    attr_reader :attributes
+    attr_reader :attributes, :catalogue
 
-    def initialize(attributes = {})
+    def initialize(attributes = {}, catalogue)
       @attributes = attributes
+      @catalogue = catalogue
     end
 
     def id
@@ -72,6 +73,10 @@ module Spandx
       attributes[:referenceNumber] = value
     end
 
+    def content
+      @content ||= Content.new(details.text, catalogue)
+    end
+
     def details
       path = File.expand_path(File.join(File.dirname(__FILE__), "../../spec/fixtures/spdx/json/details/#{id}.json"))
       Details.new(JSON.parse(IO.read(path), symbolize_names: true))
spec/unit/content_spec.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+RSpec.describe Spandx::Content do
+  subject { described_class.new(content, catalogue) }
+
+  let(:catalogue) { Spandx::Catalogue.from_file(fixture_file('spdx.json')) }
+
+  let(:content) do
+    <<-LICENSE.gsub(/^\s*/, '')
+  # The MIT License
+  =================
+
+  Copyright 2020 Tsuyoshi Garrett
+  *************************
+
+  All rights reserved.
+
+  The made
+  * * * *
+  up  license.
+
+  This license provided 'as is'. Please respect the contributors' wishes when
+  implementing the license's "software".
+  -----------
+    LICENSE
+  end
+
+  describe '#wordset' do
+    it 'creates the wordset' do
+      wordset = Set.new(
+        %w[
+          the made up license this provided as is please respect
+          contributors' wishes when implementing license's software
+        ]
+      )
+      expect(subject.tokens).to eql(wordset)
+    end
+  end
+end
spec/unit/guess_spec.rb
@@ -1,8 +1,11 @@
+# frozen_string_literal: true
+
 RSpec.describe Spandx::Guess do
   subject { described_class.new(catalogue) }
+
   let(:catalogue) { Spandx::Catalogue.from_file(fixture_file('spdx.json')) }
 
-  describe "#license_for" do
+  describe '#license_for' do
     let(:gpl_content) { license_file(spdx_id) }
     let(:gpl) { catalogue[spdx_id] }
     let(:spdx_id) { 'GPL-3.0' }
spandx.gemspec
@@ -30,7 +30,7 @@ Gem::Specification.new do |spec|
   spec.require_paths = ['lib']
 
   spec.add_dependency 'bundler', '>= 1.16', '< 3.0.0'
-  #spec.add_dependency 'licensee', '~> 9.13'
+  # spec.add_dependency 'licensee', '~> 9.13'
   spec.add_dependency 'net-hippie', '~> 0.3'
   spec.add_dependency 'nokogiri', '~> 1.10'
   spec.add_dependency 'thor', '~> 0.1'