CLT Application
This section explains the internal mechanics of the CLT applet: the accepted data formats, the tokenization and model architecture used, the training and evaluation loops, and instructions for deploying the downloaded .bin model file.
Data Processing Details
Ensuring your data is in the correct format is crucial for the CLT applet to function properly. The applet supports six different formats:
Folder with two TXT files
Use a folder containing input.txt and output.txt, where each line in input.txt corresponds to the label on the same line of output.txt.
input.txt
cat
dog
output.txt
0
1
TXT file
A .txt file can also be used, where each line contains an input-output pair separated by a comma.
cat, 0
dog, 1
JSON file
The file should contain a list of objects, each with input and output fields.
[
    {
        "input": "cat",
        "output": "0"
    },
    {
        "input": "dog",
        "output": "1"
    }
]
CSV file
Ensure the first row contains the headers input and output.
input,output
cat,0
dog,1
XML file
Use <input> and <output> tags inside <entry> elements.
<qaPairs>
    <entry>
        <input>cat</input>
        <output>0</output>
    </entry>
</qaPairs>
YAML file
Each entry should include input and output fields.
- input: "cat"
  output: "0"
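Whichever format you provide, the applet reduces the data to a list of input-output pairs. For reference, here is a minimal sketch of how the CSV and JSON variants could be parsed in Python (the function names are illustrative, not part of the applet's API):
import csv
import json

def load_pairs_csv(path):
    # Read (input, output) pairs from a CSV file with "input" and "output" headers
    with open(path, newline="", encoding="utf-8") as f:
        return [(row["input"], row["output"]) for row in csv.DictReader(f)]

def load_pairs_json(path):
    # Read (input, output) pairs from a JSON list of {"input": ..., "output": ...} objects
    with open(path, encoding="utf-8") as f:
        return [(item["input"], item["output"]) for item in json.load(f)]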
Tokenization
The pipeline tokenizes text at either the character level or the word level. You can choose between the two tokenizers below.
import string
import torch

class CharTokenizer:
    def __init__(self, max_len=100):
        # Define the character set, adding special tokens if necessary
        self.characters = string.ascii_letters + string.digits + string.punctuation + " \n\t"
        self.char_to_index = {char: idx for idx, char in enumerate(self.characters)}
        self.index_to_char = {idx: char for char, idx in self.char_to_index.items()}
        self.max_len = max_len  # Maximum length of the sequence

    def build_vocab(self, corpus=None, min_freq=None):
        # No-op: the character vocabulary is fixed; kept for API parity with VocabTokenizer
        pass

    def encode(self, text):
        # Encode the text into indices; unknown characters fall back to the space index
        encoded = [self.char_to_index.get(char, self.char_to_index[' ']) for char in text]
        # Pad or truncate to max_len
        if len(encoded) < self.max_len:
            encoded.extend([0] * (self.max_len - len(encoded)))  # Pad with index 0
        else:
            encoded = encoded[:self.max_len]  # Truncate to max_len
        return torch.tensor(encoded)

    def decode(self, indices):
        # Decode the indices back to the original text
        return ''.join([self.index_to_char.get(idx, '') for idx in indices])
import torch
from collections import Counter

class VocabTokenizer:
    def __init__(self, corpus=None, max_len=100, min_freq=1):
        """
        Initializes the tokenizer with a vocabulary built from the given corpus.

        :param corpus: List of texts to build vocabulary from (optional).
        :param max_len: Maximum sequence length.
        :param min_freq: Minimum frequency for a token to be included in the vocabulary.
        """
        self.max_len = max_len
        self.special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]
        if corpus:
            self.build_vocab(corpus, min_freq)
        else:
            self.word_to_index = {tok: idx for idx, tok in enumerate(self.special_tokens)}
            self.index_to_word = {idx: tok for tok, idx in self.word_to_index.items()}

    def build_vocab(self, corpus, min_freq):
        """
        Builds a vocabulary from a corpus, filtering words with frequency < min_freq.
        """
        word_counts = Counter()
        for text in corpus:
            word_counts.update(text.split())
        vocab = [word for word, count in word_counts.items() if count >= min_freq]
        vocab = self.special_tokens + vocab  # Add special tokens at the start
        self.word_to_index = {word: idx for idx, word in enumerate(vocab)}
        self.index_to_word = {idx: word for word, idx in self.word_to_index.items()}

    def encode(self, text):
        """
        Tokenizes text into indices, adds <SOS> and <EOS>, and pads/truncates to max_len.
        """
        tokens = text.split()
        encoded = [self.word_to_index.get(word, self.word_to_index["<UNK>"]) for word in tokens]
        # Add <SOS> and <EOS>
        encoded = [self.word_to_index["<SOS>"]] + encoded + [self.word_to_index["<EOS>"]]
        # Padding or truncating
        if len(encoded) < self.max_len:
            encoded.extend([self.word_to_index["<PAD>"]] * (self.max_len - len(encoded)))
        else:
            encoded = encoded[:self.max_len]
        return torch.tensor(encoded, dtype=torch.long)

    def decode(self, indices):
        """
        Converts token indices back into text, dropping padding and sentence markers.
        """
        words = [self.index_to_word.get(idx, "<UNK>") for idx in indices]
        return ' '.join([word for word in words if word not in ["<PAD>", "<SOS>", "<EOS>"]])
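As a quick round-trip example using the classes above (the exact indices depend on the corpus, so treat this as a sketch):
word_tok = VocabTokenizer(corpus=["cat", "dog"], max_len=8)
ids = word_tok.encode("cat")          # tensor([2, 4, 3, 0, 0, 0, 0, 0])
print(word_tok.decode(ids.tolist()))  # "cat"

char_tok = CharTokenizer(max_len=8)
print(char_tok.encode("cat"))         # fixed-length tensor of character indices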
Model Architecture
The CLT model uses an RNN-based classifier implemented in the CLTModelRNN class.
import torch
import torch.nn as nn

class CLTModelRNN(nn.Module):
    def __init__(self):
        super(CLTModelRNN, self).__init__()

    def initialise(self, hidden_size, output_size, batch_size, num_layers, input_size=100):
        # Two-phase initialisation: layers are created here rather than in __init__
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        rnn_out, hidden = self.rnn(input, hidden)
        # Classify from the hidden state of the last time step
        output = self.h2o(rnn_out[:, -1, :])
        return output, hidden

    def initHidden(self, batch_size=None):
        # Zero-initialise the hidden state; batch_size defaults to the one set in initialise
        if batch_size is None:
            batch_size = self.batch_size
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)
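A quick shape check of the forward pass, with illustrative hyperparameters:
model = CLTModelRNN()
model.initialise(hidden_size=64, output_size=2, batch_size=4, num_layers=1)
x = torch.zeros(4, 100, 100)   # (batch, sequence length, input_size)
hidden = model.initHidden()    # (num_layers, batch, hidden_size)
logits, hidden = model(x, hidden)
print(logits.shape)            # torch.Size([4, 2])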
Training Loop
Here is the training loop code used for CLT:
def train(self, qthread=None, progress_updated=None, loss_updated=None):
    self.loss_fn = nn.CrossEntropyLoss()
    if self.model.__class__.__name__ != 'kNN':
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    total_steps = self.num_epochs
    current_step = 0
    hidden = self.model.initHidden()
    self.model.train()
    for epoch in range(self.num_epochs):
        total_loss = 0
        for x, y in self.train_loader:
            # Detach the hidden state so gradients do not flow across batches
            hidden = hidden.detach()
            # Expand each encoded sequence to the (batch, 100, 100) shape the RNN expects
            x = x.unsqueeze(1).expand(-1, 100, 100).to(torch.float32)
            y = y.to(torch.long)
            self.optimizer.zero_grad()
            y_predicted, hidden = self.model(x, hidden)
            loss = self.loss_fn(y_predicted, y)
            total_loss += loss.item()
            loss.backward()
            self.optimizer.step()
        # Report progress and the average loss once per epoch
        current_step += 1
        progress = int((current_step / total_steps) * 100)
        progress_updated.emit(progress)
        qthread.msleep(1)
        avg_loss = total_loss / len(self.train_loader)
        loss_updated.emit(avg_loss)
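The loop assumes self.train_loader yields batches of encoded sequences of shape (batch, 100) together with integer class labels. A minimal sketch of building such a loader with the CharTokenizer above (the variable names are illustrative):
from torch.utils.data import TensorDataset, DataLoader

texts = ["cat", "dog"]
labels = [0, 1]
tokenizer = CharTokenizer(max_len=100)
x = torch.stack([tokenizer.encode(t) for t in texts])  # (num_samples, 100)
y = torch.tensor(labels)
train_loader = DataLoader(TensorDataset(x, y), batch_size=2, shuffle=True)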
Evaluation Loop
Here is the evaluation loop used for CLT:
def evaluate(self, eval_dataset=None):
    self.model.eval()
    correct_predictions = 0
    total_samples = 0
    # Use the provided dataset if given, otherwise fall back to the stored validation loader
    val_loader = DataLoader(eval_dataset, batch_size=self.num_batches, shuffle=False) if eval_dataset else self.val_loader
    # Hidden state is initialised with the batch size configured in initialise
    hidden = self.model.initHidden()
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(torch.float32).unsqueeze(1).expand(-1, 100, 100)
            y = y.to(torch.long)
            y_predicted, hidden = self.model(x, hidden)
            pred = torch.argmax(y_predicted, dim=1)
            correct_predictions += pred.eq(y).sum().item()
            total_samples += y.size(0)
    # Return overall accuracy
    return correct_predictions / total_samples if total_samples > 0 else 0
Deployment (Use the Trained Model)
You can use the following code to run the downloaded .bin model file externally.
class CLTTextClassifier:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def use_model(self, input_text):
        encoding = self.tokenizer.encode(input_text)
        # Shape the encoding into the (1, 100, 100) batch the model expects
        x = encoding.to(torch.float32).unsqueeze(0).unsqueeze(1).expand(-1, 100, 100)
        hidden = self.model.initHidden(1)
        self.model.eval()
        with torch.no_grad():
            output, _ = self.model(x, hidden)
        return torch.argmax(output).item()

# Example usage
model = CLTModelRNN()
model.initialise(hidden_size=64, output_size=5, batch_size=1, num_layers=1)
tokenizer = CharTokenizer()  # or VocabTokenizer, depending on your setup
classifier = CLTTextClassifier(model, tokenizer)
text_input = "This is a test sentence."
prediction = classifier.use_model(text_input)
print("Predicted class:", prediction)