CLN Application
This section explains the internal mechanics of the CLN-ML pipeline, which performs binary classification on medium-to-large numeric datasets. It covers the accepted data formats, the model architecture, training details, and deployment instructions for using the .bin model file after download.
Data Processing Details
Ensuring your data is in the correct format is crucial for the CLN applet to function properly. The applet supports six formats:
Folder with two .txt files
Use a folder containing input.txt and output.txt, where each line in input.txt corresponds to a label on the same line of output.txt.
input.txt
23321
44432
output.txt
0
1
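For illustration, such a folder could be read in Python like this (a minimal sketch; the applet performs this parsing internally):
from pathlib import Path

def load_folder(folder):
    # Pair line i of input.txt with line i of output.txt
    inputs = Path(folder, "input.txt").read_text().splitlines()
    labels = Path(folder, "output.txt").read_text().splitlines()
    return list(zip(inputs, labels))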
TXT file
A .txt file can also be used, where each line contains an input-output pair separated by a comma.
33212, 0
33215, 1
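Parsing this layout is straightforward (an illustrative sketch, not the applet's internal loader):
def load_txt(path):
    pairs = []
    with open(path) as f:
        for line in f:
            value, label = line.split(",")
            pairs.append((value.strip(), label.strip()))
    return pairs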
JSON file
The file should contain a list of objects, each with input and output fields.
[
  {
    "input": "88587",
    "output": "0"
  },
  {
    "input": "98747",
    "output": "1"
  }
]
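A file shaped this way maps directly onto Python's standard json module (sketch; field names as shown above):
import json

def load_json(path):
    with open(path) as f:
        records = json.load(f)
    return [(r["input"], r["output"]) for r in records]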
CSV file
Ensure the first row contains the headers input and output.
input,output
88574, 0
66547, 1
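With the header row present, csv.DictReader can key each row by column name (illustrative sketch):
import csv

def load_csv(path):
    with open(path, newline="") as f:
        # strip() tolerates the optional space after the comma, as in the example above
        return [(row["input"].strip(), row["output"].strip()) for row in csv.DictReader(f)]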
XML file
Use <input> and <output> tags inside <entry> elements.
<qaPairs>
  <entry>
    <input>88784</input>
    <output>0</output>
  </entry>
</qaPairs>
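The standard library's xml.etree.ElementTree handles this structure (sketch):
import xml.etree.ElementTree as ET

def load_xml(path):
    root = ET.parse(path).getroot()  # the <qaPairs> element
    return [(e.findtext("input"), e.findtext("output")) for e in root.iter("entry")]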
YAML file
Each entry should include input and output fields.
- input: "8874"
  output: "0"
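A sketch using PyYAML (an assumption on our part; any YAML parser that yields a list of dicts would work):
import yaml  # assumes the PyYAML package is installed

def load_yaml(path):
    with open(path) as f:
        records = yaml.safe_load(f)
    return [(str(r["input"]), str(r["output"])) for r in records]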
Model Architecture
The CLN model is a simple logistic regression classifier implemented in PyTorch.
import torch
import torch.nn as nn

class LogisticRegressionCLN(Model, nn.Module):  # Model is the applet's own base class
    def __init__(self):
        super(LogisticRegressionCLN, self).__init__()
        self.linear = None  # created once the feature dimension is known

    def initialise(self, d):
        self.linear = nn.Linear(d, 1)  # d input features -> 1 logit

    def forward(self, x):  # x has shape (n, d)
        x = x.to(torch.float32)
        y_predicted = torch.sigmoid(self.linear(x))  # probabilities in (0, 1)
        return y_predicted
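As a quick sanity check, the model can be initialised and run on dummy data (assuming the Model base class takes no constructor arguments, consistent with the deployment example below):
model = LogisticRegressionCLN()
model.initialise(d=5)   # five input features
x = torch.rand(4, 5)    # batch of four samples
probs = model(x)        # shape (4, 1), values in (0, 1)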
Training Loop
def train(self, qthread=None, progress_updated=None, loss_updated=None):
    self.model.train()
    self.num_batches = len(self.train_loader)
    total_steps = len(self.train_loader)
    current_step = 0
    total_loss = 0
    for x, y in self.train_loader:
        y_predicted = self.model(x)
        y = y.unsqueeze(1).to(torch.float32)  # match the (n, 1) prediction shape
        loss = self.loss_fn(y_predicted, y)
        total_loss += loss.item()
        self.optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()
        self.optimizer.step()
        current_step += 1
        progress = int((current_step / total_steps) * 100)
        if progress_updated is not None:
            progress_updated.emit(progress)
        if qthread is not None:
            qthread.msleep(1)  # yield briefly so the GUI thread stays responsive
    avg_loss = total_loss / len(self.train_loader)
    if loss_updated is not None:
        loss_updated.emit(avg_loss)
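The loop assumes self.model, self.train_loader, self.loss_fn, and self.optimizer already exist. A minimal sketch of that wiring on synthetic data (the names and hyperparameters here are illustrative, not the applet's actual trainer setup):
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

features = torch.rand(100, 10)
labels = (features.sum(dim=1) > 5).float()   # synthetic binary labels
train_loader = DataLoader(TensorDataset(features, labels), batch_size=8, shuffle=True)

model = LogisticRegressionCLN()
model.initialise(d=10)
loss_fn = nn.BCELoss()   # pairs with the model's sigmoid output
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)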
Evaluation Loop
def evaluate(self, eval_dataset=None):
    self.model.eval()
    num_correct = 0  # counts exact matches, so the return value is accuracy
    num = 0
    val_loader = DataLoader(eval_dataset, batch_size=1, shuffle=False) if eval_dataset else self.val_loader
    with torch.no_grad():
        for x, y in val_loader:
            outputs = self.model(x)
            y = y.unsqueeze(1).to(torch.float32)
            if torch.equal(y, torch.round(outputs)):
                num_correct += 1
            num += 1
    return num_correct / num
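Note that evaluate() returns accuracy rather than a loss. A hypothetical call on a held-out split (trainer stands for whatever object defines train() and evaluate()):
from torch.utils.data import TensorDataset

val_features = torch.rand(20, 10)
val_labels = (val_features.sum(dim=1) > 5).float()
accuracy = trainer.evaluate(eval_dataset=TensorDataset(val_features, val_labels))
print(f"Validation accuracy: {accuracy:.2%}")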
Deployment (Using the Trained Model)
class UserCLN(User):
    def __init__(self, model, tokenizer, device, max_length):
        super().__init__(model, tokenizer, device, max_length)

    def use_model(self, inputs):  # 'inputs' avoids shadowing the built-in input()
        output = self.model(inputs)
        return torch.round(output)  # round the probability to a 0/1 class label
Example Usage
model_path = "path/to/model.bin"
model = LogisticRegressionCLN()
model.initialise(d=10) # Same feature dimension used in training
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
model.eval()
input_tensor = torch.rand(1, 10) # Simulate test input
user = UserCLN(model=model, tokenizer=None, device="cpu", max_length=None)
prediction = user.use_model(input_tensor)
print("Predicted class:", int(prediction.item()))