Artificial Intelligence Technology Machine Learning Technology Natural Language Processing Clojure
A Clojure implementation of one-hot vectors and category vectors, as used in machine learning for natural language processing. Apart from morphological analysis and CSV input/output, it is implemented with as few external libraries as possible.
(ns test01.core
(:require [clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.string :as str])
(:import (java.io File FileInputStream InputStreamReader BufferedReader StringReader
BufferedWriter OutputStreamWriter FileOutputStream)
(org.apache.lucene.analysis.ja JapaneseAnalyzer JapaneseTokenizer)
(org.apache.lucene.analysis.ja.tokenattributes PartOfSpeechAttribute)
(org.apache.lucene.analysis.tokenattributes CharTermAttribute OffsetAttribute)
(org.apache.lucene.analysis.util CharArraySet)))
;; Rows of data/in/test-data.csv, parsed by clojure.data.csv.
;; The rows are fully realised (doall) before with-open closes the reader,
;; because read-csv is lazy and would otherwise read from a closed stream.
(def data-reads
  (with-open [rdr (io/reader "data/in/test-data.csv")]
    (-> rdr
        csv/read-csv
        doall)))
;; Morphological analysis
(defn morphological-analysis
  "Tokenises the string `src` with Lucene's JapaneseAnalyzer (no user
  dictionary, default tokenizer mode, empty stop-word set, empty stop-tag set).

  Returns a realised sequence of [surface part-of-speech] vectors, one per
  token, where `surface` is the substring of `src` covered by the token's
  offsets and `part-of-speech` is the tag reported by PartOfSpeechAttribute.

  Both the analyzer and the token stream are closed before returning; the
  analyzer is Closeable and was previously leaked on every call."
  [src]
  (with-open [analyzer (JapaneseAnalyzer. nil
                                          JapaneseTokenizer/DEFAULT_MODE
                                          CharArraySet/EMPTY_SET
                                          #{})
              ts (.tokenStream analyzer "field" (StringReader. src))]
    (let [^OffsetAttribute offsetAtt (.addAttribute ts OffsetAttribute)
          ^PartOfSpeechAttribute posAtt (.addAttribute ts PartOfSpeechAttribute)
          ;; TokenStream contract: reset before the first incrementToken.
          _ (.reset ts)
          surface #(subs src (.startOffset offsetAtt) (.endOffset offsetAtt))
          pos #(.getPartOfSpeech posAtt)
          ;; The attributes are mutated in place by incrementToken, so every
          ;; token must be captured eagerly (doall) before the stream is ended.
          tokens (->> #(when (.incrementToken ts)
                         [(surface) (pos)])
                      repeatedly
                      (take-while identity)
                      doall)
          _ (.end ts)]
      tokens)))
;; Reshape the analysis result into (part-of-speech word) pairs.
(defn simple-morph
  "Runs `morphological-analysis` on `n` and returns a lazy sequence of
  two-element lists (top-level-pos surface), where top-level-pos is the part
  of the part-of-speech tag before the first \"-\"."
  [n]
  (for [[surface pos-tag] (morphological-analysis n)]
    (list (first (str/split pos-tag #"-")) surface)))
;; Pick out the nouns.
(defn meisi-token
  "Returns the surface forms of the tokens in `n` whose top-level
  part-of-speech is \"名詞\" (noun), in order of appearance."
  [n]
  (keep (fn [[pos word]]
          (when (= "名詞" pos)
            word))
        (simple-morph n)))
;; Dictionary of the distinct nouns found in column 1 of every CSV row, with
;; the words listed in the three stop-word files removed.
;; NOTE(review): read-string evaluates reader macros; the stop-word files must
;; be trusted input.
(def word-dic-raw3
  (let [load-stop-words (fn [path] (set (read-string (slurp path))))
        stop-num        (load-stop-words "data/in/stop-word-num.txt")
        stop-kana       (load-stop-words "data/in/stop-word-kana.txt")
        stop-1w         (load-stop-words "data/in/stop-word1w.txt")
        all-nouns       (mapcat (fn [row] (meisi-token (nth row 1)))
                                data-reads)]
    (->> (distinct all-nouns)
         (remove stop-num)
         (remove stop-kana)
         (remove stop-1w))))
;; Size of the dictionary
(def count-word-dic
  "Number of entries in word-dic-raw3."
  (->> word-dic-raw3
       count))
;; Bare symbol kept from the original listing: it only echoes the value when
;; evaluated at a REPL.
count-word-dic
;; Map from category number (0-based position in word-dic-raw3) to word.
(def word-dic3
  (into {}
        (map-indexed vector word-dic-raw3)))
;; Category number of the m-th noun of the n-th sentence.
(defn pickup-category-num
  "Returns the key of `word-dic3` whose word equals the `m`-th noun (0-based)
  of the text in column 1 of row `n` of `data-reads`, or nil when that noun is
  not in the dictionary (for example because it was removed as a stop word).

  The text is taken from column 1 of the row, matching how the dictionary
  itself is built; passing the whole row vector to `meisi-token` fails because
  the analyzer needs a string. The target noun is computed once rather than
  once per dictionary entry."
  [n m]
  (let [text   (nth (nth data-reads n) 1)
        target (nth (meisi-token text) m)]
    ;; 0 is truthy in Clojure, so `some` correctly returns category 0.
    (some (fn [[num word]]
            (when (= word target)
              num))
          word-dic3)))
;; Category vector for the n-th sentence.
(defn category-vec
  "Returns a lazy sequence with one entry per noun in the text of row `n`
  (column 1 of `data-reads`): the noun's category number from `word-dic3`, or
  nil when the noun is not in the dictionary.

  The sentence is tokenised once and each noun is looked up in an inverted
  copy of the dictionary, instead of re-analysing the sentence and scanning
  the dictionary for every word. The text is read from column 1 of the row,
  matching how the dictionary is built."
  [n]
  (let [word->num (zipmap (vals word-dic3) (keys word-dic3))
        text      (nth (nth data-reads n) 1)]
    (map word->num (meisi-token text))))
;; Category vector of the n-th sentence as a {category-number occurrences} map.
(defn frequencies-category-vec
  "Returns a map from each entry of `(category-vec n)` to the number of times
  it occurs."
  [n]
  (-> n
      category-vec
      frequencies))
;; One-hot vector: position n (1-based) out of m is 1, everything else is 0.
(defn one-hot-vec
  "Returns a sequence of (n - 1) zeros, a single 1, then (m - n) zeros.
  For 1 <= n <= m the result has m elements with the 1 at 1-based position n.
  Negative zero-counts produce no elements."
  [n m]
  (let [leading  (repeat (dec n) 0)
        trailing (repeat (- m n) 0)]
    (concat leading [1] trailing)))
;; One-hot matrix for the n-th sentence: one row per dictionary noun.
(defn doc-one-hot-vec
  "Returns a vector of one-hot vectors, one per noun of sentence `n` that is
  present in the dictionary. Each row has `count-word-dic` elements.

  Category numbers are 0-based while `one-hot-vec` takes a 1-based position,
  so the number is incremented before the call; without that, categories 0
  and 1 both land on the first column and category 0 yields a row that is one
  element too long. Nouns with no category (nil) are skipped because they have
  no column to set."
  [n]
  (->> (category-vec n)
       (remove nil?)
       (mapv (fn [category]
               (vec (one-hot-vec (inc category) count-word-dic))))))
;; Collapse the one-hot matrix of a sentence into a single count vector
;; (element-wise sum of its rows).
(defn sum-doc-one-hot-vec
  "Returns the element-wise sum of the rows of `(doc-one-hot-vec n)`, or nil
  when the sentence produces no rows.

  The matrix is computed once (it was previously built twice per call) and
  summed eagerly with `mapv`, so long documents do not build a deep chain of
  nested lazy sequences."
  [n]
  (let [rows (doc-one-hot-vec n)]
    (when (seq rows)
      (reduce (fn [acc row] (mapv + acc row)) rows))))
The data created by these methods can be used for various types of classification and deep learning processes.
コメント