
Commit 298efdc
Add files via upload
1 parent 35b2d90

12 files changed: +2845 additions, 0 deletions


configs/base.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Updated for Fractional Fourier Transform."""

"""Base template config for pre-training and fine-tuning."""

import enum
import ml_collections


class ModelArchitecture(enum.Enum):
  """Determines model architecture - in particular, the mixing layer."""
  BERT = 'bert'
  FRAC_NET = 'frac_net'  # Fractional Fourier Transform mixing
  F_NET = 'f_net'  # Fourier Transform mixing
  FF_ONLY = 'ff_only'  # Feed forward sublayers only; no token mixing
  LINEAR = 'linear'  # Matrix multiplications with learnable weights
  RANDOM = 'random'  # Constant, random matrix multiplications


class TrainingMode(str, enum.Enum):
  """Determines type of training."""
  PRETRAINING = 'pretraining'
  CLASSIFICATION = 'classification'


class HybridAttentionLayout(str, enum.Enum):
  """Where, in hybrid models, attention sublayers replace mixing sublayers."""
  BOTTOM = 'bottom'  # First mixing sublayers.
  MIDDLE = 'middle'  # Middle mixing sublayers.
  MIXED = 'mixed'  # Interspersed throughout model.
  TOP = 'top'  # Final mixing sublayers.


def get_config():
  """Base config for training models."""
  config = ml_collections.ConfigDict()

  # Determines which model to use.
  # Specific mixing sublayers may be replaced with attention using
  # config.attention_layout and config.num_attention_layers.
  config.model_arch: ModelArchitecture = ModelArchitecture.FRAC_NET

  # How often to save the model checkpoint.
  config.save_checkpoints_steps: int = 1000
  # Frequency of eval during training, e.g. every 1000 steps.
  config.eval_frequency: int = 1000

  # Total batch size for training.
  config.train_batch_size: int = 32
  # Total batch size for eval.
  config.eval_batch_size: int = 8

  # The base learning rate for Adam.
  config.learning_rate: float = 1e-4

  # Initial checkpoint directory or filepath (usually from a pre-trained model).
  config.init_checkpoint_dir: str = ''

  # Whether to lower case the input text. Should be True for uncased models and
  # False for cased models.
  config.do_lower_case: bool = True

  # Model parameters.

  # For pre-training, we only need 2 segment types (for NSP), but we allow up to
  # 4 for GLUE/SuperGLUE fine-tuning.
  config.type_vocab_size: int = 4
  # Embedding dimension for each token.
  config.d_emb: int = 768
  # Hidden dimension of model.
  config.d_model: int = 768
  # Hidden dimension for feed-forward layer.
  config.d_ff: int = 3072
  # The maximum total input sequence length after tokenization. Sequences longer
  # than this will be truncated, and sequences shorter than this will be padded.
  config.max_seq_length: int = 512
  # Number of self-attention heads. Only used for BERT models.
  config.num_heads: int = 12
  # Number of model blocks / layers.
  config.num_layers: int = 12
  # Regular dropout rate, applied throughout model.
  config.dropout_rate: float = 0.1
  # Dropout rate used in mixing module, e.g. self-attention sublayer.
  config.mixing_dropout_rate: float = 0.1

  # Determines whether or not the FFT is used in lieu of matrix multiplications.
  # Only relevant for FNet. If True, favor the FFT over matrix multiplications
  # to compute the DFT.
  config.use_fft: bool = True

  # Only relevant for FRAC_NET (FrFNet): determines the fractional order of the
  # Fourier Transform.
  config.frac_order: float = 0

  # For hybrid models, attention layers replace a subset of the mixing
  # sublayers.
  config.attention_layout: HybridAttentionLayout = HybridAttentionLayout.TOP
  config.num_attention_layers: int = 0

  # Random number generator seed.
  config.seed: int = 0

  # Dummy parameter for repeated runs.
  config.trial: int = 0

  return config
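
For context, the snippet below is a minimal sketch (not part of this commit) of how a launcher script might consume get_config() and override individual fields. The field names come from the config above; the override values are illustrative only.

  from configs import base

  config = base.get_config()
  # Switch the mixing layer and adjust the optimizer, the way a launcher or
  # sweep script might; values here are examples, not recommended settings.
  config.model_arch = base.ModelArchitecture.F_NET
  config.learning_rate = 3e-4
  print(config.model_arch, config.learning_rate, config.num_layers)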

configs/classification.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Updated for Fractional Fourier Transform."""

"""Config for fine-tuning on the GLUE and SuperGLUE benchmarks."""

from configs import base as base_config
from configs.base import ModelArchitecture
from configs.base import TrainingMode


def get_config():
  """Config for fine-tuning (classification)."""
  config = base_config.get_config()

  # Determines which model to use.
  config.model_arch: ModelArchitecture = ModelArchitecture.FRAC_NET

  config.mode: TrainingMode = TrainingMode.CLASSIFICATION

  # This is "glue/DS_g", where DS_g is one of the following:
  # [cola, sst2, mrpc, qqp, stsb, mnli, qnli, rte, wnli].
  config.dataset_name: str = "glue/cola"

  # How often to save the model checkpoint.
  config.save_checkpoints_steps: int = 200
  # Training metrics will be computed (1 / eval_proportion) times during
  # training at regularly spaced intervals, regardless of dataset size.
  config.eval_proportion: float = 0.05

  # Total batch size for training.
  config.train_batch_size: int = 64
  # Total batch size for eval (and predictions).
  config.eval_batch_size: int = 32

  # The base learning rate for Adam.
  config.learning_rate: float = 1e-5

  # Total number of training epochs to perform.
  config.num_train_epochs: float = 3
  # Proportion of training to perform linear learning rate warmup for.
  # E.g., 0.1 = 10% of training steps.
  config.warmup_proportion: float = 0.1

  # Maximum number of eval steps on validation split. Actual number of steps
  # may be less for small eval datasets.
  config.max_num_eval_steps: int = int(1e5)

  # Initial checkpoint directory or filepath (usually from a pre-trained model).
  config.init_checkpoint_dir: str = ""

  # Dummy attribute for repeated runs.
  config.trial: int = 0

  return config
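
As a worked example of the eval_proportion field (a sketch, not code from this commit): with eval_proportion = 0.05 the trainer evaluates 1 / 0.05 = 20 times over the run, so the interval between evals depends on dataset size, batch size, and epochs. The CoLA split size below is an assumed example value.

  num_train_examples = 8551          # assumed size of the glue/cola train split
  train_batch_size = 64
  num_train_epochs = 3
  eval_proportion = 0.05

  total_steps = num_train_examples * num_train_epochs // train_batch_size
  eval_every = max(1, int(eval_proportion * total_steps))
  print(total_steps, eval_every)     # roughly 400 training steps, eval about every 20 steps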

configs/pretraining.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Updated for Fractional Fourier Transform."""

"""Config for pre-training on the C4 or Wikipedia dataset."""

from typing import Optional

from configs import base as base_config
from configs.base import ModelArchitecture
from configs.base import TrainingMode


def get_config():
  """Config for pre-training."""
  config = base_config.get_config()

  # Determines which model to use.
  config.model_arch: ModelArchitecture = ModelArchitecture.FRAC_NET

  config.mode: TrainingMode = TrainingMode.PRETRAINING

  # Total batch size for training.
  config.train_batch_size: int = 32
  # Total batch size for eval.
  config.eval_batch_size: int = 32

  # The base learning rate for Adam.
  config.learning_rate: float = 1e-4
  # If set, determines how much to clip the gradient during training.
  config.clipped_grad_norm: Optional[float] = None

  # Number of training steps.
  config.num_train_steps: int = int(1e6)
  # Number of warm-up steps. We generally find that larger models need more
  # warm-up steps.
  config.num_warmup_steps: int = int(1e4)

  # How often to save the model checkpoint.
  config.save_checkpoints_steps: int = 2000
  # Frequency of eval during training, e.g. every 2000 steps.
  config.eval_frequency: int = 2000

  # Maximum number of eval steps.
  config.max_num_eval_steps: int = 100

  # Do not start from a pre-trained checkpoint.
  config.init_checkpoint_dir: str = ''

  # Maximum number of masked LM predictions per sequence.
  config.max_predictions_per_seq: int = 80
  # Proportion of tokens for masked LM predictions. Total number of selected
  # tokens will be at most config.max_predictions_per_seq.
  config.masking_rate: float = 0.15
  # Proportion of masked tokens to replace with [MASK].
  config.mask_token_proportion: float = 0.8
  # Proportion of masked tokens to replace with a random token.
  config.random_token_proportion: float = 0.1
  # The remaining 1 - config.mask_token_proportion - config.random_token_proportion
  # fraction of selected tokens is left as is.

  # Dummy attribute for repeated runs.
  config.trial: int = 0

  return config
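
The three masking proportions above describe the usual BERT-style 80/10/10 policy. The function below is only an illustrative sketch of that policy (the commit does not include the input pipeline); mask_id and vocab_size are assumed values.

  import numpy as np

  def mask_tokens(token_ids, mask_id=103, vocab_size=30522, rng=None,
                  masking_rate=0.15, mask_token_proportion=0.8,
                  random_token_proportion=0.1, max_predictions_per_seq=80):
    # Select up to masking_rate * len(token_ids) positions, capped at
    # max_predictions_per_seq, and rewrite them per the 80/10/10 rule.
    rng = rng or np.random.default_rng(0)
    token_ids = np.array(token_ids)
    n_select = min(max_predictions_per_seq,
                   max(1, int(len(token_ids) * masking_rate)))
    positions = rng.choice(len(token_ids), size=n_select, replace=False)
    for pos in positions:
      r = rng.random()
      if r < mask_token_proportion:
        token_ids[pos] = mask_id                   # replace with [MASK]
      elif r < mask_token_proportion + random_token_proportion:
        token_ids[pos] = rng.integers(vocab_size)  # replace with a random token
      # else: leave the original token in place
    return token_ids, positions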

dfrt.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
"""
Computes Discrete Fractional Fourier Transform (DFrFT) matrices for a given
dimension N and fractional order a.
"""

import numpy as np
from scipy import linalg
import math


def dfrtmtrx(N, a):
    # Approximation order
    app_ord = 2
    Evec = _dis_s(N, app_ord)

    even = 1 - (N % 2)

    l = np.array(list(range(0, N - 1)) + [N - 1 + even])

    f = np.diag(np.exp(-1j * math.pi / 2 * a * l))

    F = N ** (1 / 2) * np.einsum("ij,jk,ni->nk", f, Evec.T, Evec, optimize=True)

    return F


def _dis_s(N, app_ord):

    S = _creates(N, app_ord)

    p = N
    r = math.floor(N / 2)
    P = np.zeros((p, p))

    P[0, 0] = 1
    even = 1 - (p % 2)

    for i in range(1, r - even + 1):
        P[i, i] = 1 / (2 ** (1 / 2))
        P[i, p - i] = 1 / (2 ** (1 / 2))

    if even:
        P[r, r] = 1

    for i in range(r + 1, p):
        P[i, i] = -1 / (2 ** (1 / 2))
        P[i, p - i] = 1 / (2 ** (1 / 2))

    CS = np.einsum("ij,jk,ni->nk", S, P.T, P, optimize=True)

    C2 = CS[0:math.floor(N / 2 + 1), 0:math.floor(N / 2 + 1)]
    S2 = CS[math.floor(N / 2 + 1):N, math.floor(N / 2 + 1):N]

    ec, vc = linalg.eig(C2)
    # idx = np.argsort(ec)
    # ec = ec[idx]
    # vc = vc[:,idx]

    es, vs = linalg.eig(S2)
    # idx = np.argsort(es)
    # es = es[idx]
    # vs = vs[:,idx]

    qvc = np.vstack((vc, np.zeros([math.ceil(N / 2 - 1), math.floor(N / 2 + 1)])))
    SC2 = P @ qvc  # Even eigenvectors of S

    qvs = np.vstack((np.zeros([math.floor(N / 2 + 1), math.ceil(N / 2 - 1)]), vs))
    SS2 = P @ qvs  # Odd eigenvectors of S

    idx = np.argsort(-ec)
    SC2 = SC2[:, idx]

    idx = np.argsort(-es)
    SS2 = SS2[:, idx]

    if N % 2 == 0:
        S2C2 = np.zeros([N, N + 1])
        SS2 = np.hstack([SS2, np.zeros((SS2.shape[0], 1))])
        S2C2[:, range(0, N + 1, 2)] = SC2
        S2C2[:, range(1, N, 2)] = SS2
        S2C2 = np.delete(S2C2, (N - 1), axis=1)
    else:
        S2C2 = np.zeros([N, N])
        S2C2[:, range(0, N + 1, 2)] = SC2
        S2C2[:, range(1, N, 2)] = SS2

    Evec = S2C2

    return Evec


def _creates(N, app_ord):
    # Creates S matrix of approximation order app_ord.
    # When app_ord = 1, the elementary S matrix is returned.

    app_ord = int(app_ord / 2)

    s = np.concatenate((np.array([0, 1]), np.zeros(N - 1 - 2 * app_ord), np.array([1])))
    S = _cconvm(N, s) + np.diag((np.fft.fft(s)).real)

    return S


def _cconvm(N, s):
    # Generates the circular convolution matrix of s.
    M = np.zeros((N, N))
    dum = s
    for i in range(N):
        M[:, i] = dum
        dum = np.roll(dum, 1)

    return M
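
A quick way to exercise dfrtmtrx (an assumed usage example, not part of this commit): at fractional order a = 1 the eigenvector construction is expected to reproduce the ordinary DFT matrix, and because of the sqrt(N) factor above the comparison is against NumPy's unnormalized DFT.

  import numpy as np
  from dfrt import dfrtmtrx  # assumes the repository root is on PYTHONPATH

  N = 16
  F1 = dfrtmtrx(N, 1.0)            # order-1 fractional transform
  W = np.fft.fft(np.eye(N))        # unnormalized N x N DFT matrix
  print(np.max(np.abs(F1 - W)))    # expected to be small if the port is correct

  F_half = dfrtmtrx(N, 0.5)        # a fractional order between 0 and 1
  x = np.random.default_rng(0).standard_normal(N)
  y = F_half @ x                   # mix a length-N signal with the fractional transform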
