tfはterm frequencyの略で、直訳すると単語の頻度となり、各文書においてその単語がどのくらい出現したかを表す指標となる。このtfが大きい単語は出現頻度が高い単語であり、その文書の特徴を示すという仮説が立てられる。tfは以下の式で表される。

\(\displaystyle tf=\frac{文書Aにおける単語Xの出現頻度}{文書Aにおける全単語の出現頻度の和} =\frac{n_{t,d}}{\sum_{s}n_{s,d}}\)

idfはinverse document frequencyの略で、直訳すると逆文書頻度となり、その単語が様々な文書に出現すると値が小さくなり、滅多に現れない場合は大きくなるものとなる。このidfが大きいと単語のレアさを表すこととなり、その文書の特徴を表すとの仮説が立てられる。idfは以下の式で表される。

\(\displaystyle idf=\log\left(\frac{全文書数}{単語Xを含む文書数}\right)+1=\log\frac{N}{df(t)}+1 \)


ここで実際にA=[スポーツ 野球 野球 バット]という文書と、B=[サッカー サッカー サッカー スポーツ ゴール]という文書があった時(対数の底は2とする)、tf(スポーツ、文書A)*idf(スポーツ)=0.25*1=0.25、tf(野球、文書A)*idf(野球)=0.5*2=1、tf(バット、文書A)*idf(バット)=0.25*2=0.5、tf(サッカー、文書B)*idf(サッカー)=0.6*2=1.2、tf(スポーツ、文書B)*idf(スポーツ)=0.2*1=0.2、tf(ゴール、文書B)*idf(ゴール)=0.2*2=0.4となる。これによりAとBに共通している「スポーツ」という単語はtfidfが小さくなり、それぞれの文書にしか存在せず、かつ沢山出現している「野球」や「サッカー」の単語のtfidfは大きい値を示す。

これらをClojureで実装したものとして、Holger Schauer氏による以下のものがある。

(defn freq
  "Counts how often each item occurs.

  Called with no arguments, returns a stateful transducer: every input
  increments that item's counter in an internal map, and the whole map
  accumulated so far is handed to the downstream reducing function.

  Called with a collection `coll`, returns a map from each distinct item
  to the number of times it appears in `coll`."
  ([]
   (fn [rf]
     ;; The counter map is allocated here, i.e. once per application of the
     ;; transducer to a reducing function.
     (let [freqm (volatile! {})]
       (fn
         ([] (rf))
         ([result] (rf result))
         ([result input]
          ;; fnil lets inc start from 0 for items not seen before.
          (vswap! freqm update input (fnil inc 0))
          (rf result @freqm))))))
  ([coll]
   ;; Every step emits the cumulative map; conj-ing maps into {} merges
   ;; them, so the final step's counts are the ones that remain.
   (into {} (freq) coll)))

(defn normalize-value
  "Augments a raw frequency `curfreq` relative to the largest frequency
  `maxfreq`. Returns 0.4 + 0.6 * curfreq / maxfreq."
  [maxfreq curfreq]
  (-> (* 0.6 curfreq)
      (/ maxfreq)
      (+ 0.4)))

(defn tf
  "Returns a map of term frequencies for a sequence of words.

  `wordseq` is the sequence of words of one document. The keyword option
  `:normalize` defaults to true, in which case every raw count is passed
  through `normalize-value` together with the largest count in the
  document (augmented term frequency). With `:normalize false` the raw
  counts are returned.

  An empty `wordseq` yields an empty map."
  [wordseq & {:keys [normalize] :or {normalize true}}]
  (let [tfreqs (frequencies wordseq)]
    ;; The empty check avoids calling max-key without arguments.
    (if (or (not normalize) (empty? tfreqs))
      tfreqs
      (let [maxfreq (val (apply max-key val tfreqs))
            normalize-tf (map (fn [[term cnt]]
                                [term (normalize-value maxfreq cnt)]))]
        (into {} normalize-tf tfreqs)))))

(defn tfmap-to-termvector
  "Converts the term->frequency map `tf-row` into a vector holding one
  frequency per element of `terms`, in the order of `terms`. Terms that
  are missing from `tf-row` contribute 0."
  [tf-row terms]
  (mapv #(get tf-row % 0) terms))

(defn tf-from-docs
  "Returns a two-element vector: first a vector of all distinct terms
  found in `documents`, second a sequence with one tf-vector per document
  whose positions line up with the term vector (0 for absent terms).

  `documents` is a sequence of word sequences; each one is passed to `tf`."
  [documents]
  (let [tf-rows (map tf documents)
        ;; mapcat only concatenates the key seqs; unlike flatten it does not
        ;; take apart terms that are themselves sequential.
        terms (vec (into #{} (mapcat keys) tf-rows))]
    (vector terms
            (pmap #(tfmap-to-termvector % terms) tf-rows))))

(defn idf
  "Returns a map of the inverse document frequency for a sequence of
  texts (each text being a sequence of words).

  For every term the value is ln((N + 1) / (df + 1)), where N is the
  number of texts and df the number of texts containing the term
  (add-one smoothing). An empty `textseq` yields an empty map."
  [textseq]
  (let [doccount (count textseq)
        ;; Only the presence of a term in a text matters here, so count each
        ;; term once per text.
        termdoccount (frequencies (mapcat distinct textseq))]
    (reduce-kv (fn [resmap term docswithterm]
                 (assoc resmap term
                        (Math/log (/ (+ doccount 1) ; apply smoothing!
                                     (+ docswithterm 1)))))
               {} termdoccount)))

(defn tfidf
  "Returns a two-element vector `[terms matrix]` for a sequence of texts
  (each text being a sequence of words).

  `terms` are the distinct terms of all texts. `matrix` holds one row per
  text; each row contains, in the order of `terms`, the product of the
  term's tf in that text (via `tf`, 0 if absent) and its idf, where idf is
  log10(N / df) without smoothing."
  [textseq]
  (let [alltfs (pmap tf textseq)
        ;; df: in how many texts does each term occur?
        termdoccount (reduce (fn [result tfmap]
                               (reduce (fn [resmap [term _]]
                                         (update resmap term (fnil inc 0)))
                                       result tfmap))
                             {} alltfs)
        terms (keys termdoccount)
        doccount (count textseq)
        idfs (reduce (fn [resmap [term docswithterm]]
                       (assoc resmap term
                              (Math/log10 (/ doccount ; Note: no smoothing here!
                                             docswithterm))))
                     {} termdoccount)
        matrix (pmap (fn [tfpdoc]
                       (map (fn [term]
                              (* (get tfpdoc term 0)
                                 (get idfs term 0)))
                            terms))
                     alltfs)]
    [terms matrix]))

(defn normalize-tf-xf
  "Normalizes term-frequency maps with `normalize-value`.

  Called with no arguments, returns a transducer: every input must be a
  non-empty map of term -> count; it is replaced by a map of
  term -> normalized value, using the largest count of that map as the
  maximum.

  Called with `freqs`, accepts either a single map or a sequential
  collection of maps and returns the normalized result merged into one
  map. Any other type raises an ex-info carrying the value under `:data`."
  ([]
   (fn [rf]
     (fn
       ([] (rf))
       ([result] (rf result))
       ([result input]
        (let [newmax (val (apply max-key val input))
              normalize-maxvalue (partial normalize-value newmax)
              ;; turns a map entry into [term normalized-value]
              normalize-termfreq (juxt key (comp normalize-maxvalue val))]
          (rf result (into {} (map normalize-termfreq input))))))))
  ([freqs]
   (cond
     (map? freqs) (into {} (normalize-tf-xf) [freqs])
     (sequential? freqs) (into {} (normalize-tf-xf) freqs)
     :else (throw (ex-info "Don't know how to normalize non-sequential / non-map like type"
                           {:data freqs})))))

(def norm-tf-xf
  "Transducer that will return normalized frequencies.
  Composes `(freq)` with `(normalize-tf-xf)`: the frequency map produced
  by `freq` at each step is normalized before it is passed downstream."
  (comp (freq) (normalize-tf-xf)))

(defn tf
  "Computes the term frequencies of `wordseq`, a sequence of words, as a map.
  Unless `:normalize` is passed as a falsy value (it defaults to true),
  the augmented (normalized) term frequency is returned; otherwise the
  plain counts are returned."
  [wordseq & {:keys [normalize] :or {normalize true}}]
  (if-not normalize
    ;; plain occurrence counts
    (freq wordseq)
    ;; counts run through the normalizing transducer
    (into {} norm-tf-xf wordseq)))

(defn tf-from-docs-xf
  "Builds a term/document table from per-document tf maps.

  Called with no arguments, returns a stateful transducer. Every input is
  one document's tf map (term -> tf value). Each step emits a map with
    :terms  a sorted map of term -> number of documents seen so far that
            contain the term
    :tfs    a vector with one row per document seen so far, in input
            order; each row lists the tf values in the sorted order of
            :terms, with 0 for terms the document lacks.

  Called with `coll`, a collection of word sequences, applies `tf` to each
  element and returns the map emitted for the last document."
  ([]
   (fn [rf]
     (let [termdoccount (volatile! (sorted-map))
           tfs (volatile! [])]
       (fn
         ([] (rf))
         ([result] (rf result))
         ([result input]
          (let [newtdcount ; re-compute for each term how many documents contain it
                (reduce (fn [tdcount term]
                          (update tdcount term (fnil inc 0))) ; inc #term, even if missing (=0)
                        @termdoccount (keys input))
                ;; sorted map of every known term -> 0, used as the row template
                termzeromap (into (sorted-map)
                                  (zipmap (keys newtdcount) (repeat 0)))
                ;; Re-map all earlier documents so their rows cover all terms.
                ;; mapv keeps this a vector, so the conj below appends and the
                ;; rows stay in document order (conj on a seq would prepend).
                oldrows (mapv (fn [tfdoc]
                                (vals (merge termzeromap tfdoc)))
                              @tfs)
                newrow (vals (merge termzeromap input))
                currows (conj oldrows newrow)]
            (vswap! tfs conj input)
            (vreset! termdoccount newtdcount)
            (rf result {:terms @termdoccount :tfs currows})))))))
  ([coll]
   (into {} (tf-from-docs-xf) (map tf coll))))

(defn idf-xf
  "Adds inverse document frequencies to a term/document table.

  Each input is a map with :terms (a sorted map of term -> number of
  documents containing the term) and :tfs (a collection with one row of
  term frequencies per document). The number of rows in :tfs is taken as
  the document count N, and every term's entry is replaced by
  {:doccount df, :idf log10(N / df)}. :tfs is passed through unchanged.

  Called with no arguments, returns a transducer. Called with `coll`, a
  collection of such maps, returns the transformed maps merged into one."
  ([]
   (fn [rf]
     (fn
       ([] (rf))
       ([result] (rf result))
       ([result input]
        (let [doccount (count (:tfs input))
              terms (map (fn [[term docswithterm]]
                           [term {:doccount docswithterm
                                  :idf (Math/log10 (/ doccount docswithterm))}])
                         (:terms input))]
          (rf result {:terms (into (sorted-map) terms) :tfs (:tfs input)}))))))
  ([coll]
   (into {} (idf-xf) coll)))

(defn tfidf-xf
  "Adds tf-idf values to a term/document table.

  Each input is a map with :terms (a sorted map of
  term -> {:doccount .. :idf ..}) and :tfs (one row of tf values per
  document, positioned in the same order as :terms). The output keeps
  :terms and :tfs and adds :tfidfs, one row per document holding
  tf * idf for every term.

  Called with no arguments, returns a transducer. Called with `coll`, a
  collection of such maps, returns the transformed maps merged into one."
  ([]
   (fn [rf]
     (fn
       ([] (rf))
       ([result] (rf result))
       ([result input]
        (let [tfidfs
              (map (fn [tfdoc]
                     ;; make use of the fact that the tf values are placed at exactly
                     ;; the same position as their corresponding term in the term vector
                     ;; by mapping over both tf and term vector in parallel
                     (map (fn [tfvalue [_term {:keys [idf]}]]
                            (* tfvalue idf))
                          tfdoc (:terms input)))
                   (:tfs input))]
          (rf result {:terms (:terms input) :tfs (:tfs input) :tfidfs tfidfs}))))))
  ([coll]
   (into {} (tfidf-xf) coll)))


