
Commit 298efdc
Add files via upload
1 parent 35b2d90

12 files changed: +2845 additions, 0 deletions


configs/base.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Updated for Fractional Fourier Transform."""

"""Base template config for pre-training and fine-tuning."""

import enum
import ml_collections


class ModelArchitecture(enum.Enum):
  """Determines model architecture - in particular, the mixing layer."""
  BERT = 'bert'
  FRAC_NET = 'frac_net'  # Fractional Fourier Transform mixing
  F_NET = 'f_net'  # Fourier Transform mixing
  FF_ONLY = 'ff_only'  # Feed forward sublayers only; no token mixing
  LINEAR = 'linear'  # Matrix multiplications with learnable weights
  RANDOM = 'random'  # Constant, random matrix multiplications


class TrainingMode(str, enum.Enum):
  """Determines type of training."""
  PRETRAINING = 'pretraining'
  CLASSIFICATION = 'classification'


class HybridAttentionLayout(str, enum.Enum):
  """Where, in hybrid models, attention sublayers replace mixing sublayers."""
  BOTTOM = 'bottom'  # First mixing sublayers.
  MIDDLE = 'middle'  # Middle mixing sublayers.
  MIXED = 'mixed'  # Interspersed throughout model.
  TOP = 'top'  # Final mixing sublayers.


def get_config():
  """Base config for training models."""
  config = ml_collections.ConfigDict()

  # Determines which model to use.
  # Specific mixing sublayers may be replaced with attention using
  # config.attention_layout and config.num_attention_layers.
  config.model_arch: ModelArchitecture = ModelArchitecture.FRAC_NET

  # How often to save the model checkpoint.
  config.save_checkpoints_steps: int = 1000
  # Frequency of eval during training, e.g. every 1000 steps.
  config.eval_frequency: int = 1000

  # Total batch size for training.
  config.train_batch_size: int = 32
  # Total batch size for eval.
  config.eval_batch_size: int = 8

  # The base learning rate for Adam.
  config.learning_rate: float = 1e-4

  # Initial checkpoint directory or filepath (usually from a pre-trained model).
  config.init_checkpoint_dir: str = ''

  # Whether to lower case the input text. Should be True for uncased models and
  # False for cased models.
  config.do_lower_case: bool = True

  # Model parameters.

  # For pre-training, we only need 2 segment types (for NSP), but we allow up to
  # 4 for GLUE/SuperGLUE fine-tuning.
  config.type_vocab_size: int = 4
  # Embedding dimension for each token.
  config.d_emb: int = 768
  # Hidden dimension of model.
  config.d_model: int = 768
  # Hidden dimension for feed-forward layer.
  config.d_ff: int = 3072
  # The maximum total input sequence length after tokenization. Sequences longer
  # than this will be truncated, and sequences shorter than this will be padded.
  config.max_seq_length: int = 512
  # Number of self-attention heads. Only used for BERT models.
  config.num_heads: int = 12
  # Number of model blocks / layers.
  config.num_layers: int = 12
  # Regular dropout rate, applied throughout model.
  config.dropout_rate: float = 0.1
  # Dropout rate used in mixing module, e.g. self-attention sublayer.
  config.mixing_dropout_rate: float = 0.1

  # Determines whether or not the FFT is used in lieu of matrix multiplications.
  # Only relevant for FNet. If True, favor the FFT over matrix multiplications
  # to compute the DFT.
  config.use_fft: bool = True

  # Only relevant for FRAC_NET (FrFNet): determines the fractional order of the
  # Fourier Transform.
  config.frac_order: float = 0

  # For hybrid models, attention layers replace a subset of the mixing
  # sublayers.
  config.attention_layout: HybridAttentionLayout = HybridAttentionLayout.TOP
  config.num_attention_layers: int = 0

  # Random number generator seed.
  config.seed: int = 0

  # Dummy parameter for repeated runs.
  config.trial: int = 0

  return config
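
For context, the snippet below is a minimal sketch (not part of this commit) of how a launcher script might consume get_config() and override individual fields. The field names come from the config above; the override values are illustrative only.

  from configs import base

  config = base.get_config()
  # Switch the mixing layer and adjust the optimizer, the way a launcher or
  # sweep script might; values here are examples, not recommended settings.
  config.model_arch = base.ModelArchitecture.F_NET
  config.learning_rate = 3e-4
  print(config.model_arch, config.learning_rate, config.num_layers)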

configs/classification.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Updated for Fractional Fourier Transform."""

"""Config for fine-tuning on the GLUE and SuperGLUE benchmarks."""

from configs import base as base_config
from configs.base import ModelArchitecture
from configs.base import TrainingMode


def get_config():
  """Config for fine-tuning (classification)."""
  config = base_config.get_config()

  # Determines which model to use.
  config.model_arch: ModelArchitecture = ModelArchitecture.FRAC_NET

  config.mode: TrainingMode = TrainingMode.CLASSIFICATION

  # This is "glue/DS_g", where DS_g is one of the following:
  # [cola, sst2, mrpc, qqp, stsb, mnli, qnli, rte, wnli].
  config.dataset_name: str = "glue/cola"

  # How often to save the model checkpoint.
  config.save_checkpoints_steps: int = 200
  # Training metrics will be computed (1 / eval_proportion) times during
  # training at regularly spaced intervals, regardless of dataset size.
  config.eval_proportion: float = 0.05

  # Total batch size for training.
  config.train_batch_size: int = 64
  # Total batch size for eval (and predictions).
  config.eval_batch_size: int = 32

  # The base learning rate for Adam.
  config.learning_rate: float = 1e-5

  # Total number of training epochs to perform.
  config.num_train_epochs: float = 3
  # Proportion of training to perform linear learning rate warmup for.
  # E.g., 0.1 = 10% of training steps.
  config.warmup_proportion: float = 0.1

  # Maximum number of eval steps on validation split. Actual number of steps
  # may be less for small eval datasets.
  config.max_num_eval_steps: int = int(1e5)

  # Initial checkpoint directory or filepath (usually from a pre-trained model).
  config.init_checkpoint_dir: str = ""

  # Dummy attribute for repeated runs.
  config.trial: int = 0

  return config
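
As a worked example of the eval_proportion field (a sketch, not code from this commit): with eval_proportion = 0.05 the trainer evaluates 1 / 0.05 = 20 times over the run, so the interval between evals depends on dataset size, batch size, and epochs. The CoLA split size below is an assumed example value.

  num_train_examples = 8551          # assumed size of the glue/cola train split
  train_batch_size = 64
  num_train_epochs = 3
  eval_proportion = 0.05

  total_steps = num_train_examples * num_train_epochs // train_batch_size
  eval_every = max(1, int(eval_proportion * total_steps))
  print(total_steps, eval_every)     # roughly 400 training steps, eval about every 20 steps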

configs/pretraining.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
# coding=utf-8
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Updated for Fractional Fourier Transform."""

"""Config for pre-training on the C4 or Wikipedia dataset."""

from typing import Optional

from configs import base as base_config
from configs.base import ModelArchitecture
from configs.base import TrainingMode


def get_config():
  """Config for pre-training."""
  config = base_config.get_config()

  # Determines which model to use.
  config.model_arch: ModelArchitecture = ModelArchitecture.FRAC_NET

  config.mode: TrainingMode = TrainingMode.PRETRAINING

  # Total batch size for training.
  config.train_batch_size: int = 32
  # Total batch size for eval.
  config.eval_batch_size: int = 32

  # The base learning rate for Adam.
  config.learning_rate: float = 1e-4
  # If set, determines how much to clip the gradient during training.
  config.clipped_grad_norm: Optional[float] = None

  # Number of training steps.
  config.num_train_steps: int = int(1e6)
  # Number of warm-up steps. We generally find that larger models need more
  # warm-up steps.
  config.num_warmup_steps: int = int(1e4)

  # How often to save the model checkpoint.
  config.save_checkpoints_steps: int = 2000
  # Frequency of eval during training, e.g. every 2000 steps.
  config.eval_frequency: int = 2000

  # Maximum number of eval steps.
  config.max_num_eval_steps: int = 100

  # Do not start from a pre-trained checkpoint.
  config.init_checkpoint_dir: str = ''

  # Maximum number of masked LM predictions per sequence.
  config.max_predictions_per_seq: int = 80
  # Proportion of tokens for masked LM predictions. Total number of selected
  # tokens will be at most config.max_predictions_per_seq.
  config.masking_rate: float = 0.15
  # Proportion of masked tokens to replace with [MASK].
  config.mask_token_proportion: float = 0.8
  # Proportion of masked tokens to replace with a random token.
  config.random_token_proportion: float = 0.1
  # The remaining 1 - config.mask_token_proportion - config.random_token_proportion
  # fraction of selected tokens is left as is.

  # Dummy attribute for repeated runs.
  config.trial: int = 0

  return config
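
The three masking proportions above describe the usual BERT-style 80/10/10 policy. The function below is only an illustrative sketch of that policy (the commit does not include the input pipeline); mask_id and vocab_size are assumed values.

  import numpy as np

  def mask_tokens(token_ids, mask_id=103, vocab_size=30522, rng=None,
                  masking_rate=0.15, mask_token_proportion=0.8,
                  random_token_proportion=0.1, max_predictions_per_seq=80):
    # Select up to masking_rate * len(token_ids) positions, capped at
    # max_predictions_per_seq, and rewrite them per the 80/10/10 rule.
    rng = rng or np.random.default_rng(0)
    token_ids = np.array(token_ids)
    n_select = min(max_predictions_per_seq,
                   max(1, int(len(token_ids) * masking_rate)))
    positions = rng.choice(len(token_ids), size=n_select, replace=False)
    for pos in positions:
      r = rng.random()
      if r < mask_token_proportion:
        token_ids[pos] = mask_id                   # replace with [MASK]
      elif r < mask_token_proportion + random_token_proportion:
        token_ids[pos] = rng.integers(vocab_size)  # replace with a random token
      # else: leave the original token in place
    return token_ids, positions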

dfrt.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
"""
Computes Discrete Fractional Fourier Transform (DFrFT) matrices for a given
dimension N and fractional order a.
"""

import numpy as np
from scipy import linalg
import math


def dfrtmtrx(N, a):
    # Approximation order
    app_ord = 2
    Evec = _dis_s(N, app_ord)

    even = 1 - (N % 2)

    l = np.array(list(range(0, N - 1)) + [N - 1 + even])

    f = np.diag(np.exp(-1j * math.pi / 2 * a * l))

    F = N ** (1 / 2) * np.einsum("ij,jk,ni->nk", f, Evec.T, Evec, optimize=True)

    return F


def _dis_s(N, app_ord):

    S = _creates(N, app_ord)

    p = N
    r = math.floor(N / 2)
    P = np.zeros((p, p))

    P[0, 0] = 1
    even = 1 - (p % 2)

    for i in range(1, r - even + 1):
        P[i, i] = 1 / (2 ** (1 / 2))
        P[i, p - i] = 1 / (2 ** (1 / 2))

    if even:
        P[r, r] = 1

    for i in range(r + 1, p):
        P[i, i] = -1 / (2 ** (1 / 2))
        P[i, p - i] = 1 / (2 ** (1 / 2))

    CS = np.einsum("ij,jk,ni->nk", S, P.T, P, optimize=True)

    C2 = CS[0:math.floor(N / 2 + 1), 0:math.floor(N / 2 + 1)]
    S2 = CS[math.floor(N / 2 + 1):N, math.floor(N / 2 + 1):N]

    ec, vc = linalg.eig(C2)
    # idx = np.argsort(ec)
    # ec = ec[idx]
    # vc = vc[:,idx]

    es, vs = linalg.eig(S2)
    # idx = np.argsort(es)
    # es = es[idx]
    # vs = vs[:,idx]

    qvc = np.vstack((vc, np.zeros([math.ceil(N / 2 - 1), math.floor(N / 2 + 1)])))
    SC2 = P @ qvc  # Even eigenvectors of S

    qvs = np.vstack((np.zeros([math.floor(N / 2 + 1), math.ceil(N / 2 - 1)]), vs))
    SS2 = P @ qvs  # Odd eigenvectors of S

    idx = np.argsort(-ec)
    SC2 = SC2[:, idx]

    idx = np.argsort(-es)
    SS2 = SS2[:, idx]

    if N % 2 == 0:
        S2C2 = np.zeros([N, N + 1])
        SS2 = np.hstack([SS2, np.zeros((SS2.shape[0], 1))])
        S2C2[:, range(0, N + 1, 2)] = SC2
        S2C2[:, range(1, N, 2)] = SS2
        S2C2 = np.delete(S2C2, (N - 1), axis=1)
    else:
        S2C2 = np.zeros([N, N])
        S2C2[:, range(0, N + 1, 2)] = SC2
        S2C2[:, range(1, N, 2)] = SS2

    Evec = S2C2

    return Evec


def _creates(N, app_ord):
    # Creates S matrix of approximation order app_ord.
    # When app_ord = 1, the elementary S matrix is returned.

    app_ord = int(app_ord / 2)

    s = np.concatenate((np.array([0, 1]), np.zeros(N - 1 - 2 * app_ord), np.array([1])))
    S = _cconvm(N, s) + np.diag((np.fft.fft(s)).real)

    return S


def _cconvm(N, s):
    # Generates the circular convolution matrix of s.
    M = np.zeros((N, N))
    dum = s
    for i in range(N):
        M[:, i] = dum
        dum = np.roll(dum, 1)

    return M
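
A quick way to exercise dfrtmtrx (an assumed usage example, not part of this commit): at fractional order a = 1 the eigenvector construction is expected to reproduce the ordinary DFT matrix, and because of the sqrt(N) factor above the comparison is against NumPy's unnormalized DFT.

  import numpy as np
  from dfrt import dfrtmtrx  # assumes the repository root is on PYTHONPATH

  N = 16
  F1 = dfrtmtrx(N, 1.0)            # order-1 fractional transform
  W = np.fft.fft(np.eye(N))        # unnormalized N x N DFT matrix
  print(np.max(np.abs(F1 - W)))    # expected to be small if the port is correct

  F_half = dfrtmtrx(N, 0.5)        # a fractional order between 0 and 1
  x = np.random.default_rng(0).standard_normal(N)
  y = F_half @ x                   # mix a length-N signal with the fractional transform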
