Skip to content
Draft

BERT #30

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ data/*
*.bin
.travis/configs.hpp
Testing/*
.vscode/*
12 changes: 10 additions & 2 deletions models/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
project(models)

add_subdirectory(darknet)
add_subdirectory(yolo)
# Recurse into each model mlpack provides.
set(DIRS
darknet
bert
yolo
)

foreach(dir ${DIRS})
add_subdirectory(${dir})
endforeach()

# Add directory name to sources.
set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
Expand Down
18 changes: 18 additions & 0 deletions models/bert/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
project(bert)

# Start the source list with this directory itself, matching the convention
# used by the sibling model directories.
set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../")

# BERT is header-only: list every header so IDEs and the aggregate build
# see them.
set(SOURCES
  bert.hpp
  bert_impl.hpp
  bert_tokenizer.hpp
  bert_tokenizer_impl.hpp
)

# Expand each header to its absolute path.
foreach(bert_header ${SOURCES})
  list(APPEND DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${bert_header})
endforeach()

# Export the collected sources to the parent scope for aggregation.
set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE)
111 changes: 111 additions & 0 deletions models/bert/bert.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/**
* @file models/bert/bert.hpp
* @author Mrityunjay Tripathi
*
 * Definition of BERT (Bidirectional Encoder Representations from Transformers).
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/

#ifndef MODELS_BERT_BERT_HPP
#define MODELS_BERT_BERT_HPP

#include <mlpack/prereqs.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer_types.hpp>
#include <mlpack/methods/ann/init_rules/glorot_init.hpp>

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

/**
 * Declaration of the BERT model: a token + segment embedding with positional
 * encoding, followed by a stack of Transformer Encoder blocks (see the
 * constructor in bert_impl.hpp).
 *
 * @tparam OutputLayerType Type of the last layer to be added to BERT model.
 * @tparam InitializationRuleType Initialization rule to be used to initialize
 *     parameters.
 */
template <
    typename OutputLayerType = NegativeLogLikelihood<>,
    typename InitializationRuleType = XavierInitialization
>
class BERT
{
 public:
  //! Default constructor; creates an empty (zero-sized) model.
  BERT();

  /**
   * Create the BERT object using the specified parameters.
   *
   * @param srcVocabSize The size of the vocabulary.
   * @param srcSeqLen The source sequence length.
   * @param numEncoderLayers The number of Transformer Encoder layers.
   * @param dModel The dimensionality of the model.
   * @param numHeads The number of attention heads.
   * @param dropout The dropout rate.
   * @param attentionMask The attention mask used to black-out token
   *     positions. NOTE(review): BERT is bidirectional, so this is normally
   *     not a causal "future" mask — confirm intended use.
   * @param keyPaddingMask Blacks out specific tokens (padding positions).
   */
  BERT(const size_t srcVocabSize,
       const size_t srcSeqLen,
       const size_t numEncoderLayers = 12,
       const size_t dModel = 512,
       const size_t numHeads = 8,
       const double dropout = 0.1,
       const arma::mat& attentionMask = arma::mat(),
       const arma::mat& keyPaddingMask = arma::mat());

  /**
   * Load the network from a local directory.
   *
   * @param filepath The location of the stored model.
   */
  void LoadModel(const std::string& filepath);

  /**
   * Save the network locally.
   *
   * @param filepath The location where the model is to be saved.
   */
  void SaveModel(const std::string& filepath);

 private:
  //! Locally-stored size of the vocabulary.
  size_t srcVocabSize;

  //! Locally-stored source sequence length.
  size_t srcSeqLen;

  //! Locally-stored number of Transformer Encoder blocks.
  size_t numEncoderLayers;

  //! Locally-stored dimensionality of the model.
  size_t dModel;

  //! Locally-stored number of attention heads.
  size_t numHeads;

  //! Locally-stored number of hidden units in the position-wise FFN
  //! (set to 4 * dModel by the constructors).
  size_t dimFFN;

  //! Locally-stored dropout rate.
  double dropout;

  //! Locally-stored attention mask.
  arma::mat attentionMask;

  //! Locally-stored key padding mask.
  arma::mat keyPaddingMask;

  //! Locally-stored complete network (embedding + encoder stack); this is an
  //! encoder-only model, despite BERT's name mentioning Transformers.
  FFN<OutputLayerType, InitializationRuleType> bert;
}; // class BERT

} // namespace ann
} // namespace mlpack

// Include implementation.
#include "bert_impl.hpp"

#endif
97 changes: 97 additions & 0 deletions models/bert/bert_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/**
* @file models/bert/bert_impl.hpp
* @author Mrityunjay Tripathi
*
 * Implementation of BERT (Bidirectional Encoder Representations from
 * Transformers).
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/

#ifndef MODELS_BERT_BERT_IMPL_HPP
#define MODELS_BERT_BERT_IMPL_HPP

#include "bert.hpp"

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

// Default constructor: all sizes zero, no layers added; useful only as a
// target for LoadModel() or later assignment.
template<typename OutputLayerType, typename InitializationRuleType>
BERT<OutputLayerType, InitializationRuleType>::BERT() :
    srcVocabSize(0),
    srcSeqLen(0),
    numEncoderLayers(0),
    dModel(0),
    numHeads(0),
    dimFFN(4 * dModel), // dModel is already 0 here, so dimFFN is 0 too.
    dropout(0.0)
{
  // Nothing to do here.
}

// Parameterized constructor: builds the full network (embeddings, positional
// encoding, dropout, and the Transformer Encoder stack).
template<typename OutputLayerType, typename InitializationRuleType>
BERT<OutputLayerType, InitializationRuleType>::BERT(
    const size_t srcVocabSize,
    const size_t srcSeqLen,
    const size_t numEncoderLayers,
    const size_t dModel,
    const size_t numHeads,
    const double dropout,
    const arma::mat& attentionMask,
    const arma::mat& keyPaddingMask) :
    srcVocabSize(srcVocabSize),
    srcSeqLen(srcSeqLen),
    numEncoderLayers(numEncoderLayers),
    dModel(dModel),
    numHeads(numHeads),
    dimFFN(4 * dModel),
    dropout(dropout),
    attentionMask(attentionMask),
    keyPaddingMask(keyPaddingMask)
{
  // Token embedding plus segment (token-type) embedding, summed together.
  // The FFN takes ownership of the layer pointer.
  AddMerge<>* embedding = new AddMerge<>();
  // Fix: the original code referenced an undeclared `vocabSize`; the
  // parameter/member is `srcVocabSize`.
  embedding->Add<Lookup<>>(srcVocabSize, dModel);
  // Segment embedding with 3 entries — presumably sentence A / sentence B /
  // padding; TODO(review): confirm the segment vocabulary size.
  embedding->Add<Lookup<>>(3, dModel);

  bert.Add(embedding);
  bert.Add<PositionalEncoding<>>(dModel, srcSeqLen);
  bert.Add<Dropout<>>(dropout);

  // Stack of `numEncoderLayers` Transformer Encoder blocks with GELU
  // activation in the position-wise feed-forward sublayer.
  mlpack::ann::TransformerEncoder<mlpack::ann::GELUFunction> encoder(
      numEncoderLayers,
      srcSeqLen,
      dModel,
      numHeads,
      dimFFN,
      dropout,
      attentionMask,
      keyPaddingMask);

  bert.Add(encoder.Model());
}


// Load the serialized network from `filepath`.
template<typename OutputLayerType, typename InitializationRuleType>
void BERT<OutputLayerType, InitializationRuleType>::LoadModel(
    const std::string& filepath)
{
  // data::Load() returns false on failure; the original code ignored the
  // result and always printed a success message.
  if (!data::Load(filepath, "BERT", bert))
  {
    std::cerr << "Unable to load model from \"" << filepath << "\"!"
        << std::endl;
    return;
  }
  std::cout << "Loaded model" << std::endl;
}

// Serialize the network to `filepath`.
template<typename OutputLayerType, typename InitializationRuleType>
void BERT<OutputLayerType, InitializationRuleType>::SaveModel(
    const std::string& filepath)
{
  std::cout << "Saving model" << std::endl;
  // data::Save() returns false on failure; the original code ignored the
  // result and always printed a success message.
  if (!data::Save(filepath, "BERT", bert))
  {
    std::cerr << "Unable to save model to \"" << filepath << "\"!"
        << std::endl;
    return;
  }
  std::cout << "Model saved in " << filepath << std::endl;
}

} // namespace ann
} // namespace mlpack

#endif
125 changes: 125 additions & 0 deletions models/bert/bert_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/**
* @file models/bert/bert_tokenizer.hpp
* @author Mrityunjay Tripathi
*
* Definition of the BERT Tokenizer.
*
* @code
* @article{Wolf2019HuggingFacesTS,
* title = {HuggingFace's Transformers: State-of-the-art Natural Language
* Processing},
* author = {Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond
* and Clement Delangue and Anthony Moi and Pierric Cistac and
* Tim Rault and R'emi Louf and Morgan Funtowicz and Jamie Brew},
* journal = {ArXiv},
* year = {2019},
* volume = {abs/1910.03771}
* }
* @endcode
*
* mlpack is free software; you may redistribute it and/or modify it under the
* terms of the 3-clause BSD license. You should have received a copy of the
* 3-clause BSD license along with mlpack. If not, see
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/

#ifndef MODELS_BERT_BERT_TOKENIZER_HPP
#define MODELS_BERT_BERT_TOKENIZER_HPP

#include <mlpack/prereqs.hpp>

namespace mlpack {
namespace ann /** Artificial Neural Network. */ {

class BertTokenizer
{
 public:
  /**
   * Create a BertTokenizer object with no vocabulary loaded.
   */
  BertTokenizer();

  /**
   * Create the BertTokenizer object using the specified parameters.
   * (Fix: the original doc said "TransformerDecoder" and documented a
   * nonexistent `unkSplit` parameter.)
   *
   * @param vocabFile Location of file containing the vocabulary.
   * @param lowerCase Whether to turn each token to lower case.
   * @param basicTokenize Whether to do basic tokenization before WordPiece.
   * @param neverSplit Tokens which will never be split during tokenization.
   *     Only has an effect when basicTokenize = true.
   * @param unkToken The unknown token. A token that is not in the vocabulary
   *     cannot be converted to an ID and is set to be this token instead.
   * @param sepToken The separator token. It is used when building a sequence
   *     from multiple sequences, e.g. two sequences for sequence
   *     classification or for a text and a question for question answering.
   *     It is also used as the last token of a sequence built with special
   *     tokens.
   * @param padToken The token used for padding, for example when batching
   *     sequences of different lengths.
   * @param clsToken The classifier token which is used when doing sequence
   *     classification (classification of the whole sequence instead of
   *     per-token classification). It is the first token of the sequence
   *     when built with special tokens.
   * @param maskToken The token used for masking values. This is the token used
   *     when training this model with masked language modeling. This is the
   *     token which the model will try to predict.
   */
  BertTokenizer(const std::string vocabFile,
      const bool lowerCase = true,
      const bool basicTokenize = true,
      const std::vector<std::string> neverSplit = std::vector<std::string>(),
      const std::string unkToken = "[UNK]",
      const std::string sepToken = "[SEP]",
      const std::string padToken = "[PAD]",
      const std::string clsToken = "[CLS]",
      const std::string maskToken = "[MASK]");

 private:
  //! Location of vocabulary.
  std::string vocabFile;

  //! Locally-stored vocabulary.
  std::vector<std::string> vocabulary;

  //! Whether to turn each token to lower case.
  bool lowerCase;

  //! Whether to do basic tokenization before WordPiece.
  bool basicTokenize;

  //! Tokens which will never be split during tokenization. Only has an effect
  //! when basicTokenize = true.
  std::vector<std::string> neverSplit;

  //! The unknown token. A token that is not in the vocabulary cannot be
  //! converted to an ID and is set to be this token instead.
  std::string unkToken;

  //! The separator token. It is used when building a sequence from multiple
  //! sequences, e.g. two sequences for sequence classification or for a text
  //! and a question for question answering. It is also used as the last token
  //! of a sequence built with special tokens.
  std::string sepToken;

  //! The token used for padding, for example when batching sequences of
  //! different lengths.
  std::string padToken;

  //! The classifier token which is used when doing sequence classification
  //! (classification of the whole sequence instead of per-token
  //! classification). It is the first token of the sequence when built with
  //! special tokens.
  std::string clsToken;

  //! The token used for masking values. This is the token used when training
  //! this model with masked language modeling. This is the token which the
  //! model will try to predict.
  std::string maskToken;
}; // class BertTokenizer

} // namespace ann
} // namespace mlpack

// Include implementation. This must happen OUTSIDE the mlpack::ann namespace:
// the implementation file opens those namespaces itself, so including it
// inside them (as the original did) would nest them incorrectly
// (mlpack::mlpack::ann::...). This also matches the pattern used in bert.hpp.
#include "bert_tokenizer_impl.hpp"

#endif
Loading