CLT Application

This section explains the internal mechanics of the CLT applet. It covers the accepted data formats, the model architecture, the training and evaluation loops, and how to deploy the downloaded .bin model file.

Data Processing Details

Ensuring your data is in the correct format is crucial for the CLT applet to function properly. The applet supports six different formats:

Folder with 2 txt files

Use a folder containing input.txt and output.txt, where each line in input.txt corresponds to a label in output.txt.

input.txt

cat
dog

output.txt

0
1

TXT file

A .txt file can also be used, where each line contains an input-output pair separated by a comma.

cat, 0
dog, 1

JSON file

The file should contain a list of objects, each with input and output fields.

[
  {
    "input": "cat",
    "output": "0"
  },
  {
    "input": "dog",
    "output": "1"
  }
]

CSV file

Ensure the first row contains headers: input and output.

input,output
cat,0
dog,1

XML file

Use <input> and <output> tags inside <entry> elements.

<qaPairs>
  <entry>
    <input>cat</input>
    <output>0</output>
  </entry>
</qaPairs>

YAML file

Each entry should include input and output fields.

- input: "cat"
  output: "0"
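
Whatever the format, the applet ultimately needs parallel lists of inputs and labels. The sketch below shows one plausible way to normalize the TXT, JSON, and CSV variants into that shape; the helper name load_pairs is illustrative, not the applet's actual loader.

import csv
import json

def load_pairs(path):
    # Normalize a dataset file into (inputs, labels) lists.
    inputs, labels = [], []
    if path.endswith(".json"):
        with open(path) as f:
            for entry in json.load(f):
                inputs.append(entry["input"])
                labels.append(int(entry["output"]))
    elif path.endswith(".csv"):
        with open(path, newline="") as f:
            for row in csv.DictReader(f):
                inputs.append(row["input"].strip())
                labels.append(int(row["output"]))
    else:  # comma-separated .txt lines such as "cat, 0"
        with open(path) as f:
            for line in f:
                text, label = line.rsplit(",", 1)
                inputs.append(text.strip())
                labels.append(int(label))
    return inputs, labels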

Tokenization

This pipeline tokenizes text either character by character or word by word; you can choose between the two tokenizers below.


import torch
import string

class CharTokenizer:
    def __init__(self, max_len=100):
        # Define the character set, adding special tokens if necessary
        self.characters = string.ascii_letters + string.digits + string.punctuation + " \n\t"
        self.char_to_index = {char: idx for idx, char in enumerate(self.characters)}
        self.index_to_char = {idx: char for char, idx in self.char_to_index.items()}
        self.max_len = max_len  # Maximum length of the sequence
    
    def build_vocab(self, corpus=None, min_freq=None):
        # No-op: the character set is fixed, so there is no vocabulary to
        # build; kept for interface parity with VocabTokenizer.
        pass
    
    def encode(self, text):
        # Encode the text into indices
        encoded = [self.char_to_index.get(char, self.char_to_index[' ']) for char in text]

        # Pad or truncate to the max_len
        if len(encoded) < self.max_len:
            encoded.extend([0] * (self.max_len - len(encoded)))  # Pad with index 0 (note: 0 also maps to 'a' in this character set)
        else:
            encoded = encoded[:self.max_len]  # Truncate to max_len

        return torch.tensor(encoded)

    def decode(self, indices):
        # Decode the indices back to the original text
        return ''.join([self.index_to_char.get(idx, '') for idx in indices])
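
A minimal usage sketch (max_len=10 is arbitrary): encode always returns a tensor of exactly max_len indices, and decode maps indices back to characters.

tokenizer = CharTokenizer(max_len=10)
ids = tokenizer.encode("cat")
print(ids.shape)                           # torch.Size([10]); padded to max_len
print(tokenizer.decode(ids[:3].tolist()))  # "cat"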


import torch
from collections import Counter

class VocabTokenizer:
    def __init__(self, corpus=None, max_len=100, min_freq=1):
        """
        Initializes the tokenizer with a vocabulary built from the given corpus.
        :param corpus: List of texts to build vocabulary from (optional).
        :param max_len: Maximum sequence length.
        :param min_freq: Minimum frequency for a token to be included in the vocabulary.
        """
        self.max_len = max_len
        self.special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]

        if corpus:
            self.build_vocab(corpus, min_freq)
        else:
            self.word_to_index = {tok: idx for idx, tok in enumerate(self.special_tokens)}
            self.index_to_word = {idx: tok for tok, idx in self.word_to_index.items()}

    def build_vocab(self, corpus, min_freq):
        """
        Builds a vocabulary from a corpus, filtering words with frequency < min_freq.
        """
        word_counts = Counter()
        for text in corpus:
            word_counts.update(text.split())

        vocab = [word for word, count in word_counts.items() if count >= min_freq]
        vocab = self.special_tokens + vocab  # Add special tokens at the start

        self.word_to_index = {word: idx for idx, word in enumerate(vocab)}
        self.index_to_word = {idx: word for word, idx in self.word_to_index.items()}

    def encode(self, text):
        """
        Tokenizes text into indices, adds <SOS> and <EOS>, and pads/truncates to max_len.
        """
        tokens = text.split()
        encoded = [self.word_to_index.get(word, self.word_to_index["<UNK>"]) for word in tokens]

        # Add <SOS> and <EOS>
        encoded = [self.word_to_index["<SOS>"]] + encoded + [self.word_to_index["<EOS>"]]

        # Padding or truncating
        if len(encoded) < self.max_len:
            encoded.extend([self.word_to_index["<PAD>"]] * (self.max_len - len(encoded)))
        else:
            encoded = encoded[:self.max_len]

        return torch.tensor(encoded, dtype=torch.long)

    def decode(self, indices):
        """
        Converts token indices back into text.
        """
        words = [self.index_to_word.get(idx, "<UNK>") for idx in indices]
        return ' '.join([word for word in words if word not in ["<PAD>", "<SOS>", "<EOS>"]])
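
A minimal usage sketch with a toy two-sentence corpus; decode drops the special tokens, so the round trip recovers the original words.

corpus = ["the cat sat", "the dog ran"]
tokenizer = VocabTokenizer(corpus=corpus, max_len=8)
ids = tokenizer.encode("the cat ran")
print(ids)                             # <SOS>, the, cat, ran, <EOS>, then <PAD>s
print(tokenizer.decode(ids.tolist()))  # "the cat ran"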

Model Architecture

The CLT model uses an RNN-based classifier implemented in the CLTModelRNN class. Note that the layers are created in initialise() rather than __init__, so initialise() must be called before the model is used.

import torch
import torch.nn as nn

class CLTModelRNN(nn.Module):
    def __init__(self):
        super(CLTModelRNN, self).__init__()

    def initialise(self, hidden_size, output_size, batch_size, num_layers, input_size = 100):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        rnn_out, hidden = self.rnn(input, hidden)
        output = self.h2o(rnn_out[:, -1, :])
        return output, hidden

    def initHidden(self, batch_size=None):
        # Default to the batch size given at initialise(); deployment code
        # passes 1 for single-example inference.
        if batch_size is None:
            batch_size = self.batch_size
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)
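
As a quick shape check (the sizes below are arbitrary, not the applet's defaults): a batch of 4 sequences of length 100 with 100 input features produces one logit vector per example.

model = CLTModelRNN()
model.initialise(hidden_size=64, output_size=5, batch_size=4, num_layers=1)

x = torch.randn(4, 100, 100)   # (batch, seq_len, input_size)
hidden = model.initHidden()    # (num_layers, batch, hidden_size)
logits, hidden = model(x, hidden)
print(logits.shape)            # torch.Size([4, 5]): one score per class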

Training Loop

Here is the training loop code used by CLT:

def train(self, qthread=None, progress_updated=None, loss_updated=None):
    self.loss_fn = nn.CrossEntropyLoss()
    if self.model.__class__.__name__ != 'kNN':
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

    total_steps = self.num_epochs
    current_step = 0
    hidden = self.model.initHidden()
    
    self.model.train()
    for epoch in range(self.num_epochs):
        total_loss = 0
        for x, y in self.train_loader:
            hidden = hidden.detach()
            x = x.unsqueeze(1).expand(-1, 100, 100).to(torch.float32)  # repeat each encoding across the time axis: (batch, seq_len, input_size)
            y = y.to(torch.long)

            self.optimizer.zero_grad()
            y_predicted, hidden = self.model(x, hidden)
            loss = self.loss_fn(y_predicted, y)
            total_loss += loss.item()
            loss.backward()
            self.optimizer.step()

        current_step += 1
        progress = int((current_step / total_steps) * 100)
        if progress_updated is not None:
            progress_updated.emit(progress)
        if qthread is not None:
            qthread.msleep(1)  # give the GUI thread a moment to repaint
        avg_loss = total_loss / len(self.train_loader)
        if loss_updated is not None:
            loss_updated.emit(avg_loss)
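
The loop assumes self.train_loader yields batches of encoded inputs (shape (batch, max_len)) and integer labels. One plausible way to build such a loader from a tokenizer and parsed (inputs, labels) lists is sketched below; make_loader is an illustrative helper, and drop_last=True avoids a final partial batch that would not match the fixed-size hidden state.

import torch
from torch.utils.data import DataLoader, TensorDataset

def make_loader(inputs, labels, tokenizer, batch_size=32):
    # Stack the fixed-length encodings into a (num_samples, max_len) tensor
    x = torch.stack([tokenizer.encode(text) for text in inputs])
    y = torch.tensor(labels, dtype=torch.long)
    # drop_last=True: the hidden state is created once for a fixed batch size
    return DataLoader(TensorDataset(x, y), batch_size=batch_size,
                      shuffle=True, drop_last=True)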

Evaluation Loop

Here is the evaluation loop used for CLT:

def evaluate(self, eval_dataset=None):
    self.model.eval()
    correct_predictions = 0
    total_samples = 0

    val_loader = DataLoader(eval_dataset, batch_size=self.num_batches, shuffle=False) if eval_dataset else self.val_loader
    hidden = self.model.initHidden()

    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(torch.float32).unsqueeze(1).expand(-1, 100, 100)
            y = y.to(torch.long)
            y_predicted, hidden = self.model(x, hidden)
            pred = torch.argmax(y_predicted, dim=1)
            correct_predictions += pred.eq(y).sum().item()
            total_samples += y.size(0)

    return correct_predictions / total_samples if total_samples > 0 else 0
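
For example, accuracy on a held-out split can be computed by passing any (x, y) dataset; val_texts, val_labels, and trainer below are placeholders for your own data and training object.

from torch.utils.data import TensorDataset

val_x = torch.stack([tokenizer.encode(text) for text in val_texts])
val_y = torch.tensor(val_labels, dtype=torch.long)
accuracy = trainer.evaluate(TensorDataset(val_x, val_y))
print(f"Validation accuracy: {accuracy:.2%}")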

Deployment (Use the Trained Model)

You can use the following code to run the downloaded .bin model file outside the applet.

class CLTTextClassifier:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def use_model(self, input_text):
        encoding = self.tokenizer.encode(input_text)
        x = encoding.to(torch.float32).unsqueeze(0).unsqueeze(1).expand(-1, 100, 100)
        hidden = self.model.initHidden(1)
        self.model.eval()
        with torch.no_grad():
            output, _ = self.model(x, hidden)
        return torch.argmax(output).item()

# Example usage
model = CLTModelRNN()
model.initialise(hidden_size=64, output_size=5, batch_size=1, num_layers=1)
# Load the downloaded weights. This assumes the .bin file is a state_dict
# saved with torch.save; adjust the path to wherever you saved the file.
model.load_state_dict(torch.load("model.bin"))
tokenizer = CharTokenizer()  # or VocabTokenizer, depending on your setup

classifier = CLTTextClassifier(model, tokenizer)
text_input = "This is a test sentence."
prediction = classifier.use_model(text_input)
print("Predicted class:", prediction)
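
use_model returns the class index, so mapping back to a human-readable label is a single lookup; the class_names list below is a placeholder for whatever labels your training data used.

class_names = ["class_0", "class_1", "class_2", "class_3", "class_4"]  # placeholder labels
print("Predicted label:", class_names[prediction])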