## @package char_rnn
# Module caffe2.python.examples.char_rnn
from caffe2.python import core, workspace, model_helper, utils, brew
from caffe2.python.rnn_cell import LSTM
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import build_sgd

import argparse
import logging
import numpy as np
from datetime import datetime

'''
This script takes a text file as input and uses a recurrent neural network
to learn to predict the next character in a sequence.
'''

logging.basicConfig()
log = logging.getLogger("char_rnn")
log.setLevel(logging.DEBUG)


# The mutable default set() here is intentional: it accumulates created net
# names across calls, like a global variable.
def CreateNetOnce(net, created_names=set()):  # noqa
    name = net.Name()
    if name not in created_names:
        created_names.add(name)
        workspace.CreateNet(net)


class CharRNN(object):
    def __init__(self, args):
        self.seq_length = args.seq_length
        self.batch_size = args.batch_size
        self.iters_to_report = args.iters_to_report
        self.hidden_size = args.hidden_size

        with open(args.train_data) as f:
            self.text = f.read()

        self.vocab = list(set(self.text))
        self.char_to_idx = {ch: idx for idx, ch in enumerate(self.vocab)}
        self.idx_to_char = {idx: ch for idx, ch in enumerate(self.vocab)}
        self.D = len(self.char_to_idx)

        print("Input has {} characters. Total input size: {}".format(
            len(self.vocab), len(self.text)))

    def CreateModel(self):
        log.debug("Start training")
        model = model_helper.ModelHelper(name="char_rnn")

        input_blob, seq_lengths, hidden_init, cell_init, target = \
            model.net.AddExternalInputs(
                'input_blob',
                'seq_lengths',
                'hidden_init',
                'cell_init',
                'target',
            )

        hidden_output_all, self.hidden_output, _, self.cell_state = LSTM(
            model, input_blob, seq_lengths, (hidden_init, cell_init),
            self.D, self.hidden_size, scope="LSTM")
        output = brew.fc(
            model,
            hidden_output_all,
            None,
            dim_in=self.hidden_size,
            dim_out=self.D,
            axis=2
        )

        # axis is 2 because the first two dimensions are T (time) and
        # N (batch size). We treat them as one big batch of size T * N.
        softmax = model.net.Softmax(output, 'softmax', axis=2)

        softmax_reshaped, _ = model.net.Reshape(
            softmax, ['softmax_reshaped', '_'], shape=[-1, self.D])

        # Create a copy of the current net. We will use it on the forward
        # pass where we don't need loss and backward operators.
        self.forward_net = core.Net(model.net.Proto())

        xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent')
        # Loss is averaged both across the batch and through time.
        # That's why the learning rate below is multiplied by self.seq_length.
        loss = model.net.AveragedLoss(xent, 'loss')
        model.AddGradientOperators([loss])

        # Use the build_sgd helper to build an SGD optimizer.
        build_sgd(
            model,
            base_learning_rate=0.1 * self.seq_length,
            policy="step",
            stepsize=1,
            gamma=0.9999
        )

        self.model = model
        self.predictions = softmax
        self.loss = loss

        self.prepare_state = core.Net("prepare_state")
        self.prepare_state.Copy(self.hidden_output, hidden_init)
        self.prepare_state.Copy(self.cell_state, cell_init)

    def _idx_at_pos(self, pos):
        return self.char_to_idx[self.text[pos]]

    def TrainModel(self):
        log.debug("Training model")

        workspace.RunNetOnce(self.model.param_init_net)

        # As though we predicted the same probability for each character.
        smooth_loss = -np.log(1.0 / self.D) * self.seq_length
        last_n_iter = 0
        last_n_loss = 0.0
        num_iter = 0
        N = len(self.text)

        # We split the text into batch_size pieces. Each piece will be used
        # only by the corresponding batch entry during the training process.
        text_block_positions = np.zeros(self.batch_size, dtype=np.int32)
        text_block_size = N // self.batch_size
        text_block_starts = list(range(0, N, text_block_size))
        text_block_sizes = [text_block_size] * self.batch_size
        text_block_sizes[self.batch_size - 1] += N % self.batch_size
        assert sum(text_block_sizes) == N

        # Write to the output states, which will be copied to the input
        # states within the loop below.
        workspace.FeedBlob(self.hidden_output, np.zeros(
            [1, self.batch_size, self.hidden_size], dtype=np.float32
        ))
        workspace.FeedBlob(self.cell_state, np.zeros(
            [1, self.batch_size, self.hidden_size], dtype=np.float32
        ))
        workspace.CreateNet(self.prepare_state)

        # We iterate over the text in a loop many times. Each time we pick a
        # seq_length segment and feed it to the LSTM as a sequence.
        last_time = datetime.now()
        progress = 0
        while True:
            workspace.FeedBlob(
                "seq_lengths",
                np.array([self.seq_length] * self.batch_size, dtype=np.int32)
            )
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros(
                [self.seq_length, self.batch_size, self.D]
            ).astype(np.float32)
            target = np.zeros(
                [self.seq_length * self.batch_size]
            ).astype(np.int32)

            for e in range(self.batch_size):
                for i in range(self.seq_length):
                    pos = text_block_starts[e] + text_block_positions[e]
                    input[i][e][self._idx_at_pos(pos)] = 1
                    target[i * self.batch_size + e] = \
                        self._idx_at_pos((pos + 1) % N)
                    text_block_positions[e] = (
                        text_block_positions[e] + 1) % text_block_sizes[e]
                    progress += 1

            workspace.FeedBlob('input_blob', input)
            workspace.FeedBlob('target', target)

            CreateNetOnce(self.model.net)
            workspace.RunNet(self.model.net.Name())

            num_iter += 1
            last_n_iter += 1

            if num_iter % self.iters_to_report == 0:
                new_time = datetime.now()
                print("Characters Per Second: {}".format(
                    int(progress / (new_time - last_time).total_seconds())
                ))
                print("Iterations Per Second: {}".format(
                    int(self.iters_to_report /
                        (new_time - last_time).total_seconds())
                ))

                last_time = new_time
                progress = 0

                print("{} Iteration {} {}".format(
                    '-' * 10, num_iter, '-' * 10))

            loss = workspace.FetchBlob(self.loss) * self.seq_length
            smooth_loss = 0.999 * smooth_loss + 0.001 * loss
            last_n_loss += loss

            if num_iter % self.iters_to_report == 0:
                self.GenerateText(500, np.random.choice(self.vocab))

                log.debug("Loss since last report: {}"
                          .format(last_n_loss / last_n_iter))
                log.debug("Smooth loss: {}".format(smooth_loss))

                last_n_loss = 0.0
                last_n_iter = 0

    def GenerateText(self, num_characters, ch):
        # Given a starting symbol, we feed a fake sequence of size 1 to
        # our RNN num_characters times. After each step we use the output
        # probabilities to pick the next character to feed to the network.
        # Same character becomes part of the output.
        CreateNetOnce(self.forward_net)

        text = '' + ch

        for _i in range(num_characters):
            workspace.FeedBlob(
                "seq_lengths",
                np.array([1] * self.batch_size, dtype=np.int32)
            )
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros([1, self.batch_size, self.D]).astype(np.float32)
            input[0][0][self.char_to_idx[ch]] = 1

            workspace.FeedBlob("input_blob", input)
            workspace.RunNet(self.forward_net.Name())

            p = workspace.FetchBlob(self.predictions)
            next = np.random.choice(self.D, p=p[0][0])

            ch = self.idx_to_char[next]
            text += ch

        print(text)


@utils.debug
def main():
    parser = argparse.ArgumentParser(
        description="Caffe2: Char RNN Training"
    )
    parser.add_argument("--train_data", type=str, default=None,
                        help="Path to training data in a text file format",
                        required=True)
    parser.add_argument("--seq_length", type=int, default=25,
                        help="One training example sequence length")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Training batch size")
    parser.add_argument("--iters_to_report", type=int, default=500,
                        help="How often to report loss and generate text")
    parser.add_argument("--hidden_size", type=int, default=100,
                        help="Dimension of the hidden representation")
    parser.add_argument("--gpu", action="store_true",
                        help="If set, training is going to use GPU 0")

    args = parser.parse_args()

    device = core.DeviceOption(
        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 0)
    with core.DeviceScope(device):
        model = CharRNN(args)
        model.CreateModel()
        model.TrainModel()


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
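

# Example invocation (a sketch; "tiny-shakespeare.txt" is a placeholder path,
# and --gpu assumes a GPU-enabled Caffe2 build):
#
#   python char_rnn.py --train_data tiny-shakespeare.txt \
#       --batch_size 32 --seq_length 25 --hidden_size 100 --iters_to_report 500
#
# Any plain-text corpus works; the vocabulary is built from the distinct
# characters found in the file.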