Commit 51cfa48

mo khan <mo.khan@gmail.com>
2020-01-27 18:19:16
Use simple canonical form in dice coefficient algorithm
1 parent 5fa3d61
Changed files (5)
lib/spandx/content/text.rb
@@ -5,19 +5,18 @@ module Spandx
     class Text
       attr_reader :tokens
 
-      def initialize(content, catalogue)
+      def initialize(content)
         @content = content
-        @stripper = Stripper.new(catalogue)
         @tokens = tokenize(content)
       end
 
       def similar?(other)
-        score = self <=> other
+        score = dice_coefficient(other)
         score > 89.0
       end
 
       # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
-      def <=>(other)
+      def dice_coefficient(other)
         overlap = (tokens & other.tokens).size
         total = tokens.size + other.tokens.size
         100.0 * (overlap * 2.0 / total)
@@ -25,24 +24,15 @@ module Spandx
 
       private
 
-      attr_reader :content, :stripper
+      attr_reader :content
 
       def tokenize(content)
-        content = canonicalize(content)
-        content = stripper.strip(content)
-        content.downcase.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
+        canonicalize(content).scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
       end
 
       def canonicalize(content)
-        NORMALIZATIONS.each do |key, hash|
-          content = content.gsub(hash[:from], hash[:to])
-        end
-        content
-          .gsub(/\b#{Regexp.union(WORDS.keys)}\b/, WORDS)
-          .gsub(REGEXES[:bullet], "\n\n* ")
-          .gsub(/\)\s+\(/, ')(')
+        content.downcase
       end
-
     end
   end
 end
lib/spandx/catalogue.rb
@@ -45,7 +45,7 @@ module Spandx
     end
 
     def map_from(license_hash)
-      License.new(license_hash, self)
+      License.new(license_hash)
     end
 
     def present?(item)
lib/spandx/guess.rb
@@ -10,6 +10,14 @@ module Spandx
         @item = item
       end
 
+      def >(other)
+        score > other.score
+      end
+
+      def <(other)
+        score < other.score
+      end
+
       def <=>(other)
         score <=> other.score
       end
@@ -26,12 +34,14 @@ module Spandx
     end
 
     def license_for(content)
-      this = Content::Text.new(content, catalogue)
-      catalogue
-        .map { |x| Score.new(this.similar?(x.content), x) }
-        .max
-        .item
-        .id
+      this = Content::Text.new(content)
+
+      max_score = catalogue.map do |license|
+        percentage = this.dice_coefficient(license.content)
+        Score.new(percentage, license)
+      end.max
+
+      max_score.item.id
     end
   end
 end
lib/spandx/license.rb
@@ -14,11 +14,10 @@ module Spandx
       end
     end
 
-    attr_reader :attributes, :catalogue
+    attr_reader :attributes
 
-    def initialize(attributes = {}, catalogue)
+    def initialize(attributes = {})
       @attributes = attributes
-      @catalogue = catalogue
     end
 
     def id
@@ -74,7 +73,7 @@ module Spandx
     end
 
     def content
-      @content ||= Content::Text.new(details.text, catalogue)
+      @content ||= Content::Text.new(details.text)
     end
 
     def details
spec/unit/content/text_spec.rb
@@ -1,9 +1,8 @@
 # frozen_string_literal: true
 
 RSpec.describe Spandx::Content::Text do
-  subject { described_class.new(content, catalogue) }
+  subject { described_class.new(content) }
 
-  let(:catalogue) { Spandx::Catalogue.from_file(fixture_file('spdx.json')) }
   let(:content) do
     license_file('MIT')
       .gsub('<year>', Time.now.year.to_s)
@@ -11,9 +10,11 @@ RSpec.describe Spandx::Content::Text do
   end
 
   describe "#similar?" do
-    let(:mit) { described_class.new(license_file('MIT'), catalogue) }
+    let(:mit) { described_class.new(license_file('MIT')) }
+    let(:lgpl) { described_class.new(license_file('LGPL-2.0')) }
 
     specify { expect(subject.similar?(mit)).to be(true)  }
+    specify { expect(subject.similar?(lgpl)).to be(false)  }
     specify { expect(subject.similar?(subject)).to be(true) }
   end
 end