I Don't Really Understand What AI Actually Is, So I Just Spent 100 Days Studying It: Day 96
For the background to this series, please see here.
■Today's Progress
- Building a text generation AI
■Introduction
This time, too, I continue working through "ゼロから作るDeep Learning② 自然言語処理編" (O'Reilly Japan).
In this post, I use the weights trained so far with the LSTM language model to build a text generation model.
■Text Generation
The LSTM language model has been trained to capture the characteristics of text, learning weights that raise its predictive accuracy. With these weights we can predict the word that follows the current one from a probability distribution and string together natural-sounding language.
This time we are not building an interactive chatbot but doing simple text generation only, so the first word is supplied by us, and rather than converting anything with an intended meaning, the aim is simply to produce natural-looking sentences from the probability distribution.
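As a toy sketch of this idea (the five-word vocabulary and the scores below are made up purely for illustration and are not the book's code), "predicting the next word from a probability distribution" just means turning the model's scores into probabilities with softmax and sampling from them instead of always taking the argmax:

import numpy as np

vocab = ['you', 'say', 'goodbye', 'hello', '.']   # made-up toy vocabulary
scores = np.array([1.2, 3.0, 0.3, 2.5, 0.1])      # made-up scores for the next word

p = np.exp(scores - scores.max())                  # softmax: exponentiate...
p /= p.sum()                                       # ...and normalize to probabilities

next_id = np.random.choice(len(vocab), p=p)        # stochastic sampling, not argmax
print(vocab[next_id], p)

Because the choice is stochastic, repeated runs pick different next words, which is exactly why the generated sentences differ from run to run.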
■Text Generation Class
First, we implement a class for generating text.
We add a generate method by subclassing the Rnnlm class implemented earlier. Its arguments are the word ID from which to start generation (start_id), word IDs to skip (skip_ids: for example N and <unk>), and the number of words to generate (sample_size).
A while loop runs until the text reaches sample_size words: the word scores are predicted and converted into a probability distribution (p) with Softmax, a word is sampled stochastically from that distribution, and it is appended as the next word if it is not contained in skip_ids.
class RnnlmGen(Rnnlm):
    def generate(self, start_id, skip_ids=None, sample_size=10):
        word_ids = [start_id]
        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            p = softmax(score.flatten())
            sampled = np.random.choice(len(p), size=1, p=p)
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(int(x))
        return word_ids
A get_state method lets us save the LSTM layer's internal state (the hidden state and the cell state).
    def get_state(self):
        return self.lstm_layer.h, self.lstm_layer.c
A set_state method sets the LSTM layer's internal state so that generation can be resumed from it.
    def set_state(self, state):
        self.lstm_layer.set_state(*state)
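As a small, hypothetical usage sketch (assuming a RnnlmGen instance whose pretrained weights have been loaded and the start_id / skip_ids variables defined in the full script further below), these two methods let us checkpoint the state in the middle of generation and resume from it later:

# Hypothetical sketch: pause generation, keep the LSTM state, and resume later.
model = RnnlmGen()
model.load_params('Rnnlm.pkl')             # pretrained weights, as in the script below

first_half = model.generate(start_id, skip_ids, sample_size=5)
state = model.get_state()                  # (h, c) of the LSTM layer

model.reset_state()                        # internal state cleared here...
model.set_state(state)                     # ...and restored from the checkpoint
second_half = model.generate(first_half[-1], skip_ids, sample_size=5)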
■Generative AI Model
Now let's generate text with the LSTM language model. Here I use the pretrained weights available from the link below, give "we" as the first word, and generate a sentence of 10 words.
https://github.com/oreilly-japan/deep-learning-from-scratch-2
import sys
import os
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.utils.extmath import randomized_svd
import collections

GPU = False

# setting for PTB dataset
key_file = {
    'train': 'ptb.train.txt',
    'test': 'ptb.test.txt',
    'valid': 'ptb.valid.txt'
}
save_file = {
    'train': 'ptb.train.npy',
    'test': 'ptb.test.npy',
    'valid': 'ptb.valid.npy'
}
vocab_file = 'ptb.vocab.pkl'
dataset_dir = os.path.dirname(os.path.abspath(__file__))
mid_path = '..\..\Download_Dataset\lstm-master\data'


def load_vocab():
    vocab_path = os.path.join(dataset_dir, vocab_file)
    print(vocab_path)
    if os.path.exists(vocab_path):
        with open(vocab_path, 'rb') as f:
            word_to_id, id_to_word = pickle.load(f)
        return word_to_id, id_to_word
    word_to_id = {}
    id_to_word = {}
    data_type = 'train'
    file_name = key_file[data_type]
    file_path = os.path.join(dataset_dir, mid_path, file_name)
    words = open(file_path).read().replace('\n', '<eos>').strip().split()
    for i, word in enumerate(words):
        if word not in word_to_id:
            tmp_id = len(word_to_id)
            word_to_id[word] = tmp_id
            id_to_word[tmp_id] = word
    with open(vocab_path, 'wb') as f:
        pickle.dump((word_to_id, id_to_word), f)
    return word_to_id, id_to_word


def load_data(data_type='train'):
    if data_type == 'val':
        data_type = 'valid'
    save_path = dataset_dir + '\\' + save_file[data_type]
    print('save_path:', save_path)
    word_to_id, id_to_word = load_vocab()
    if os.path.exists(save_path):
        corpus = np.load(save_path)
        return corpus, word_to_id, id_to_word
    file_name = key_file[data_type]
    file_path = os.path.join(dataset_dir, mid_path, file_name)
    words = open(file_path).read().replace('\n', '<eos>').strip().split()
    corpus = np.array([word_to_id[w] for w in words])
    np.save(save_path, corpus)
    return corpus, word_to_id, id_to_word


class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        if GPU:
            np.scatter_add(dW, self.idx, dout)
        else:
            np.add.at(dW, self.idx, dout)
        return None


def softmax(x):
    if x.ndim == 2:
        x = x - x.max(axis=1, keepdims=True)
        x = np.exp(x)
        x /= x.sum(axis=1, keepdims=True)
    elif x.ndim == 1:
        x = x - np.max(x)
        x = np.exp(x) / np.sum(np.exp(x))
    return x


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


class LSTM:
    def __init__(self, Wx, Wh, b):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.cache = None

    def forward(self, x, h_prev, c_prev):
        Wx, Wh, b = self.params
        N, H = h_prev.shape
        A = np.dot(x, Wx) + np.dot(h_prev, Wh) + b
        f = A[:, :H]
        g = A[:, H:2*H]
        i = A[:, 2*H:3*H]
        o = A[:, 3*H:]
        f = sigmoid(f)
        g = np.tanh(g)
        i = sigmoid(i)
        o = sigmoid(o)
        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)
        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next

    def backward(self, dh_next, dc_next):
        Wx, Wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache
        tanh_c_next = np.tanh(c_next)
        ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)
        dc_prev = ds * f
        di = ds * g
        df = ds * c_prev
        do = dh_next * tanh_c_next
        dg = ds * i
        di *= i * (1 - i)
        df *= f * (1 - f)
        do *= o * (1 - o)
        dg *= (1 - g ** 2)
        dA = np.hstack((df, dg, di, do))
        dWh = np.dot(h_prev.T, dA)
        dWx = np.dot(x.T, dA)
        db = dA.sum(axis=0)
        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db
        dx = np.dot(dA, Wx.T)
        dh_prev = np.dot(dA, Wh.T)
        return dx, dh_prev, dc_prev


class TimeLSTM:
    def __init__(self, Wx, Wh, b, stateful=False):
        self.params = [Wx, Wh, b]
        self.grads = [np.zeros_like(Wx), np.zeros_like(Wh), np.zeros_like(b)]
        self.layers = None
        self.h, self.c = None, None
        self.dh = None
        self.stateful = stateful

    def forward(self, xs):
        Wx, Wh, b = self.params
        N, T, D = xs.shape
        H = Wh.shape[0]
        self.layers = []
        hs = np.empty((N, T, H), dtype='f')
        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dtype='f')
        for t in range(T):
            layer = LSTM(*self.params)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h
            self.layers.append(layer)
        return hs

    def backward(self, dhs):
        Wx, Wh, b = self.params
        N, T, H = dhs.shape
        D = Wx.shape[0]
        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad
        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
        return dxs

    def set_state(self, h, c=None):
        self.h, self.c = h, c

    def reset_state(self):
        self.h, self.c = None, None


class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None
        self.W = W

    def forward(self, xs):
        N, T = xs.shape
        V, D = self.W.shape
        out = np.empty((N, T, D), dtype='f')
        self.layers = []
        for t in range(T):
            layer = Embedding(self.W)
            out[:, t, :] = layer.forward(xs[:, t])
            self.layers.append(layer)
        return out

    def backward(self, dout):
        N, T, D = dout.shape
        grad = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dout[:, t, :])
            grad += layer.grads[0]
        self.grads[0][...] = grad
        return None


class TimeAffine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        N, T, D = x.shape
        W, b = self.params
        rx = x.reshape(N*T, -1)
        out = np.dot(rx, W) + b
        self.x = x
        return out.reshape(N, T, -1)

    def backward(self, dout):
        x = self.x
        N, T, D = x.shape
        W, b = self.params
        dout = dout.reshape(N*T, -1)
        rx = x.reshape(N*T, -1)
        db = np.sum(dout, axis=0)
        dW = np.dot(rx.T, dout)
        dx = np.dot(dout, W.T)
        dx = dx.reshape(*x.shape)
        self.grads[0][...] = dW
        self.grads[1][...] = db
        return dx


class TimeSoftmaxWithLoss:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.ignore_label = -1

    def forward(self, xs, ts):
        N, T, V = xs.shape
        if ts.ndim == 3:
            ts = ts.argmax(axis=2)
        mask = (ts != self.ignore_label)
        xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)
        mask = mask.reshape(N * T)
        ys = softmax(xs)
        ls = np.log(ys[np.arange(N * T), ts])
        ls *= mask
        loss = -np.sum(ls)
        loss /= mask.sum()
        self.cache = (ts, ys, mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        ts, ys, mask, (N, T, V) = self.cache
        dx = ys
        dx[np.arange(N * T), ts] -= 1
        dx *= dout
        dx /= mask.sum()
        dx *= mask[:, np.newaxis]
        dx = dx.reshape((N, T, V))
        return dx


class Rnnlm:
    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        self.layers = [
            TimeEmbedding(embed_W),
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs):
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts):
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        self.lstm_layer.reset_state()

    def save_params(self, file_name='Rnnlm.pkl'):
        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)

    def load_params(self, file_name='Rnnlm.pkl'):
        with open(file_name, 'rb') as f:
            self.params = pickle.load(f)


class RnnlmGen(Rnnlm):
    def generate(self, start_id, skip_ids=None, sample_size=10):
        word_ids = [start_id]
        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            p = softmax(score.flatten())
            sampled = np.random.choice(len(p), size=1, p=p)
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(int(x))
        return word_ids

    def get_state(self):
        return self.lstm_layer.h, self.lstm_layer.c

    def set_state(self, state):
        self.lstm_layer.set_state(*state)


corpus, word_to_id, id_to_word = load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)

model = RnnlmGen()
model.load_params('Rnnlm.pkl')

start_word = 'we'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]

word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)
we replied asian clout eliminate prentice thrifts disruption jurors undersecretary
The opening subject and past-tense verb read naturally together, but beyond that the words form a meaningless jumble rather than a sentence.
Having Google Translate (whose earlier neural machine translation system was itself built on LSTM-based models) translate it gives the following.
私たちは答えた アジアの影響力 プレンティス・スリフツの排除 陪審員の混乱 次官
Even translated into Japanese, it makes no sense.
■Generative AI with the Improved Model
Next, let's generate text with the improved LSTM language model implemented last time, which stacks multiple LSTM layers, adds Dropout, and ties (shares) weights. Since that model has already been implemented, the code is almost identical apart from the classes used, so only the new classes and the changed driver code are shown below. The parameters passed as arguments are the same as before.
# The imports, the PTB loading code (load_vocab, load_data), the layer
# implementations (Embedding, softmax, sigmoid, LSTM, TimeLSTM, TimeEmbedding,
# TimeAffine, TimeSoftmaxWithLoss) and the Rnnlm class are identical to the
# previous listing and are omitted here.

class TimeDropout:
    def __init__(self, dropout_ratio=0.5):
        self.params, self.grads = [], []
        self.dropout_ratio = dropout_ratio
        self.mask = None
        self.train_flg = True

    def forward(self, xs):
        if self.train_flg:
            flg = np.random.rand(*xs.shape) > self.dropout_ratio
            scale = 1 / (1.0 - self.dropout_ratio)
            self.mask = flg.astype(np.float32) * scale
            return xs * self.mask
        else:
            return xs

    def backward(self, dout):
        return dout * self.mask


class BetterRnnlm(Rnnlm):
    def __init__(self, vocab_size=10000, wordvec_size=650,
                 hidden_size=650, dropout_ratio=0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4 * H).astype('f')
        lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4 * H).astype('f')
        affine_b = np.zeros(V).astype('f')
        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        for layer in self.drop_layers:
            layer.train_flg = train_flg
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        for layer in self.lstm_layers:
            layer.reset_state()


class BetterRnnlmGen(BetterRnnlm):
    def generate(self, start_id, skip_ids=None, sample_size=10):
        word_ids = [start_id]
        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x).flatten()
            p = softmax(score).flatten()
            sampled = np.random.choice(len(p), size=1, p=p)
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(int(x))
        return word_ids

    def get_state(self):
        states = []
        for layer in self.lstm_layers:
            states.append((layer.h, layer.c))
        return states

    def set_state(self, states):
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)


corpus, word_to_id, id_to_word = load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)

model = BetterRnnlmGen()
model.load_params('Rnnlm.pkl')

start_word = 'we'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]

word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)
we renamed researcher taught elizabeth copy relative persistent prudential-bache did
It is still somewhat unnatural, but perhaps a little better. Having Google Translate translate it in the same way gives the following.
私たちは、研究者の名前を変更し、エリザベスに教え、相対的に持続的なプルデンシャル・バチェをコピーしました
Compared with the earlier string of words this is some improvement, with locally well-formed phrases.
■Closing Remarks
In this post I tried a model that generates text by sampling each next word from a probability distribution, using the weights learned with the LSTM language models built so far. There is still untapped potential here, and adding more training data or stacking more layers could well yield more natural sentences, but even as it is, the improved LSTM language model produced something that at least resembles text.
Ninety-six days into this AI study, which I have kept up while staying committed to work (and F1, and F1, and F2, and SGT, and F1), I have finally reached the point of understanding something like generative AI and tinkering with the source code without relying on libraries.
Since this learning satisfies my original goal, I can feel a little relieved, resolve to push through the remaining four days, and end today's study here.
■References
- Andreas C. Müller, Sarah Guido. Pythonではじめる機械学習. 中田秀基 訳. O'Reilly Japan. 2017. 392p.
- 斎藤康毅. ゼロから作るDeep Learning ―Pythonで学ぶディープラーニングの理論と実装. O'Reilly Japan. 2016. 320p.
- 斎藤康毅. ゼロから作るDeep Learning② ―自然言語処理編. O'Reilly Japan. 2018. 432p.
- ChatGPT. 4o mini. OpenAI. 2024. https://chatgpt.com/
- API Reference. scikit-learn.org. https://scikit-learn.org/stable/api/index.html
- PyTorch documentation. pytorch.org. https://pytorch.org/docs/stable/index.html
- Keiron O’Shea, Ryan Nash. An Introduction to Convolutional Neural Networks. https://ar5iv.labs.arxiv.org/html/1511.08458
- API Reference. scipy.org. 2024. https://docs.scipy.org/doc/scipy/reference/index.html