學習率排程
學習率被認為是訓練深度神經網路時最重要的超參數之一,但要選出合適的值可能相當困難。與其只使用固定的學習率,通常會改用學習率排程器。在此範例中,我們將使用餘弦排程器。在餘弦排程器開始發揮作用之前,我們會先進行所謂的熱身(warmup)階段:在前 `warmup_epochs` 個 epoch 中,學習率會線性增加。如需有關餘弦排程器的更多資訊,請參閱論文「SGDR: Stochastic Gradient Descent with Warm Restarts」(SGDR:帶有熱重啟的隨機梯度下降法)。
我們將向您說明如何:
- 定義學習率排程
- 使用該排程訓練簡易模型
def create_learning_rate_fn(config, base_learning_rate, steps_per_epoch):
    """Builds a linear-warmup schedule followed by a cosine-decay schedule.

    Args:
      config: object exposing `warmup_epochs` and `num_epochs`.
      base_learning_rate: value reached at the end of the warmup; also the
        initial value of the cosine decay.
      steps_per_epoch: number of steps that make up one epoch.

    Returns:
      The schedule returned by `optax.join_schedules`, switching from the
      warmup schedule to the cosine schedule at the end of the warmup steps.
    """
    warmup_steps = config.warmup_epochs * steps_per_epoch

    # Linear ramp from 0 up to the base learning rate over the warmup steps.
    linear_warmup = optax.linear_schedule(
        init_value=0.,
        end_value=base_learning_rate,
        transition_steps=warmup_steps,
    )

    # Decay over the epochs left after warmup, but never fewer than one epoch.
    decay_epochs = max(config.num_epochs - config.warmup_epochs, 1)
    cosine_decay = optax.cosine_decay_schedule(
        init_value=base_learning_rate,
        decay_steps=decay_epochs * steps_per_epoch,
    )

    return optax.join_schedules(
        schedules=[linear_warmup, cosine_decay],
        boundaries=[warmup_steps],
    )
若要使用此排程,我們必須將超參數傳遞給 `create_learning_rate_fn` 函數以建立學習率函數,然後再將該函數傳遞給您的 Optax 最佳化器。例如,要在 MNIST 上使用此排程,需要變更 `train_step` 函數(以下先列出原始版本,接著是修改後的版本):
@jax.jit
def train_step(state, batch):
    """Applies one gradient update computed on a single batch.

    Args:
      state: training state exposing `params` and `apply_gradients`.
      batch: mapping with an 'image' entry fed to the model and a 'label'
        entry that is one-hot encoded over 10 classes for the loss.

    Returns:
      A `(new_state, metrics)` pair, where `metrics` is the result of
      `compute_metrics` on the logits and the batch labels.
    """

    def mean_cross_entropy(params):
        # The logits are returned as auxiliary output so they can be reused
        # for the metrics without a second forward pass.
        predictions = CNN().apply({'params': params}, batch['image'])
        targets = jax.nn.one_hot(batch['label'], 10)
        per_example = optax.softmax_cross_entropy(predictions, targets)
        return jnp.mean(per_example), predictions

    (_, logits), grads = jax.value_and_grad(
        mean_cross_entropy, has_aux=True)(state.params)
    updated_state = state.apply_gradients(grads=grads)
    step_metrics = compute_metrics(logits, batch['label'])
    return updated_state, step_metrics
@functools.partial(jax.jit, static_argnums=2)
def train_step(state, batch, learning_rate_fn):
    """Applies one gradient update and reports the learning rate in the metrics.

    `learning_rate_fn` (positional argument 2) is declared static for
    `jax.jit` through `static_argnums=2`.

    Args:
      state: training state exposing `params`, `step` and `apply_gradients`.
      batch: mapping with an 'image' entry fed to the model and a 'label'
        entry that is one-hot encoded over 10 classes for the loss.
      learning_rate_fn: callable mapping a step count to a learning rate.

    Returns:
      A `(new_state, metrics)` pair, where `metrics` is the result of
      `compute_metrics` with an added 'learning_rate' entry.
    """

    def mean_cross_entropy(params):
        # The logits are returned as auxiliary output so they can be reused
        # for the metrics without a second forward pass.
        predictions = CNN().apply({'params': params}, batch['image'])
        targets = jax.nn.one_hot(batch['label'], 10)
        per_example = optax.softmax_cross_entropy(predictions, targets)
        return jnp.mean(per_example), predictions

    (_, logits), grads = jax.value_and_grad(
        mean_cross_entropy, has_aux=True)(state.params)
    updated_state = state.apply_gradients(grads=grads)
    step_metrics = compute_metrics(logits, batch['label'])
    # Evaluated at the step of the incoming state, i.e. before the update
    # above was applied.
    step_metrics['learning_rate'] = learning_rate_fn(state.step)
    return updated_state, step_metrics
以及 `train_epoch` 函數:
def train_epoch(state, train_ds, batch_size, epoch, rng):
    """Runs one pass over the training data and averages the batch metrics.

    Args:
      state: training state passed to and returned from `train_step`.
      train_ds: mapping of arrays indexed along their first axis; must
        contain an 'image' entry, whose length defines the dataset size.
      batch_size: number of examples per batch.
      epoch: epoch number, used only in the log message.
      rng: JAX PRNG key used to shuffle the example indices.

    Returns:
      A `(state, epoch_metrics)` pair, where `epoch_metrics` maps each metric
      name to its mean over all batches of the epoch.
    """
    num_examples = len(train_ds['image'])
    steps_per_epoch = num_examples // batch_size

    # Shuffle all indices, drop the tail that does not fill a whole batch,
    # and lay the rest out as one row of indices per step.
    shuffled = jax.random.permutation(rng, num_examples)
    usable = shuffled[:steps_per_epoch * batch_size]
    index_rows = usable.reshape((steps_per_epoch, batch_size))

    collected = []
    for row in index_rows:
        batch = {name: values[row, ...] for name, values in train_ds.items()}
        state, step_metrics = train_step(state, batch)
        collected.append(step_metrics)

    # Fetch the metrics to host memory, then average each one over the epoch.
    collected = jax.device_get(collected)
    epoch_metrics = {}
    for name in collected[0]:
        epoch_metrics[name] = np.mean([entry[name] for entry in collected])

    logging.info('train epoch: %d, loss: %.4f, accuracy: %.2f', epoch,
                 epoch_metrics['loss'], epoch_metrics['accuracy'] * 100)
    return state, epoch_metrics
def train_epoch(state, train_ds, batch_size, epoch, learning_rate_fn, rng):
    """Runs one pass over the training data and averages the batch metrics.

    Args:
      state: training state passed to and returned from `train_step`.
      train_ds: mapping of arrays indexed along their first axis; must
        contain an 'image' entry, whose length defines the dataset size.
      batch_size: number of examples per batch.
      epoch: epoch number, used only in the log message.
      learning_rate_fn: learning rate schedule forwarded to `train_step`.
      rng: JAX PRNG key used to shuffle the example indices.

    Returns:
      A `(state, epoch_metrics)` pair, where `epoch_metrics` maps each metric
      name to its mean over all batches of the epoch.
    """
    num_examples = len(train_ds['image'])
    steps_per_epoch = num_examples // batch_size

    # Shuffle all indices, drop the tail that does not fill a whole batch,
    # and lay the rest out as one row of indices per step.
    shuffled = jax.random.permutation(rng, num_examples)
    usable = shuffled[:steps_per_epoch * batch_size]
    index_rows = usable.reshape((steps_per_epoch, batch_size))

    collected = []
    for row in index_rows:
        batch = {name: values[row, ...] for name, values in train_ds.items()}
        state, step_metrics = train_step(state, batch, learning_rate_fn)
        collected.append(step_metrics)

    # Fetch the metrics to host memory, then average each one over the epoch.
    collected = jax.device_get(collected)
    epoch_metrics = {}
    for name in collected[0]:
        epoch_metrics[name] = np.mean([entry[name] for entry in collected])

    logging.info('train epoch: %d, loss: %.4f, accuracy: %.2f', epoch,
                 epoch_metrics['loss'], epoch_metrics['accuracy'] * 100)
    return state, epoch_metrics
以及 `create_train_state` 函數:
def create_train_state(rng, config):
    """Builds the initial `TrainState` with a fixed-learning-rate SGD optimizer.

    Args:
      rng: PRNG key used to initialize the model parameters.
      config: object exposing `learning_rate` and `momentum`.

    Returns:
      A `TrainState` holding the initialized `CNN` parameters, the model's
      `apply` function and the SGD optimizer.
    """
    model = CNN()
    # Parameters are initialized from an all-ones input of shape
    # [1, 28, 28, 1].
    sample_input = jnp.ones([1, 28, 28, 1])
    variables = model.init(rng, sample_input)
    optimizer = optax.sgd(config.learning_rate, config.momentum)
    return train_state.TrainState.create(
        apply_fn=model.apply,
        params=variables['params'],
        tx=optimizer,
    )
def create_train_state(rng, config, learning_rate_fn):
    """Builds the initial `TrainState` with a scheduled-learning-rate SGD optimizer.

    Args:
      rng: PRNG key used to initialize the model parameters.
      config: object exposing `momentum`.
      learning_rate_fn: learning rate schedule handed to `optax.sgd` in place
        of a constant learning rate.

    Returns:
      A `TrainState` holding the initialized `CNN` parameters, the model's
      `apply` function and the SGD optimizer.
    """
    model = CNN()
    # Parameters are initialized from an all-ones input of shape
    # [1, 28, 28, 1].
    sample_input = jnp.ones([1, 28, 28, 1])
    variables = model.init(rng, sample_input)
    optimizer = optax.sgd(learning_rate_fn, config.momentum)
    return train_state.TrainState.create(
        apply_fn=model.apply,
        params=variables['params'],
        tx=optimizer,
    )