CLI Application
This section explains the internal mechanics of the CLI pipeline, which is used to classify images. It covers the accepted data format, the model architecture, training and evaluation details, and deployment instructions for using the downloaded .bin model file.
Data Processing Details
GEGT expects a specific dataset format for image classification tasks. Ensure your data follows the structure below:
Folder Format
The dataset should be organized in a folder that includes:
- A subfolder named `input` containing image files named `1.jpg`, `2.jpg`, `3.jpg`, etc.
- A text file named `output.txt` where each line is the prompt or expected output (label) for the image with the same number: line 1 belongs to `1.jpg`, line 2 to `2.jpg`, and so on.
Example structure:
/your_dataset_folder/
├── input/
│ ├── 1.jpg
│ ├── 2.jpg
│ └── 3.jpg
└── output.txt
Example contents of output.txt:
label for image 1
label for image 2
label for image 3
Model Architecture
A simple convolutional neural network (CNN) is used for image classification. It consists of two convolutional layers, max-pooling, ReLU activations, and a fully connected output layer.
class CLIModelCNN(nn.Module):
    """Small convolutional network for image classification.

    The layers are not created in ``__init__``; call :meth:`initialise`
    before using ``forward`` or ``load_state_dict``.
    """

    def __init__(self):
        super(CLIModelCNN, self).__init__()

    @staticmethod
    def _size_after_stage(size: int) -> int:
        """Return one spatial dimension after a conv + pooling stage."""
        # An unpadded 3x3 convolution trims 2 pixels; the 2x2, stride-2
        # max-pool then halves the result, rounding down.
        return (size - 2) // 2

    def initialise(self, input_size: tuple, output_size: int):
        """Create the layers for images of shape ``input_size``.

        Args:
            input_size: ``(channels, height, width)`` of one input image.
            output_size: Number of values produced by the output layer
                (one per class).

        Raises:
            ValueError: If the image is too small to pass through both
                convolution/pooling stages.
        """
        in_ch, h, w = input_size

        # Both stages (conv -> ReLU -> pool) are applied in forward(), so the
        # flattened feature count follows from the input height and width.
        out_h = self._size_after_stage(self._size_after_stage(h))
        out_w = self._size_after_stage(self._size_after_stage(w))
        if out_h < 1 or out_w < 1:
            raise ValueError(
                f"input_size {tuple(input_size)} is too small for this network"
            )

        self.cnn1 = nn.Conv2d(in_channels=in_ch, out_channels=32, kernel_size=3)
        self.pooling = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        self.cnn2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        self.flatten = nn.Flatten()
        # 64 is the channel count produced by cnn2. For 500x500 inputs this
        # evaluates to 968256, the value that used to be hard-coded here.
        self.output = nn.Linear(64 * out_h * out_w, output_size)

    def forward(self, input):
        """Run a batch of images through the network and return the raw outputs."""
        input = self.relu(self.cnn1(input))
        input = self.pooling(input)
        input = self.relu(self.cnn2(input))
        input = self.pooling(input)
        flat = self.flatten(input)
        output = self.output(flat)
        return output
Training Loop
Trains the CNN over multiple epochs. Each epoch involves computing loss and updating model weights using backpropagation.
def train(self, qthread=None, progress_updated=None, loss_updated=None):
    """Train ``self.model`` for ``self.num_epochs`` epochs.

    Each epoch iterates over ``self.train_loader``, computes the loss with
    ``self.loss_fn`` and updates the weights through ``self.optimizer``.

    Args:
        qthread: Optional object exposing ``msleep``; when given, it is asked
            to sleep for 1 ms after every epoch.
        progress_updated: Optional object exposing ``emit``; receives the
            integer completion percentage after every epoch.
        loss_updated: Optional object exposing ``emit``; receives the mean
            batch loss of every epoch.

    All three arguments default to ``None`` and are skipped when omitted, so
    the method can also be called without a GUI thread or signals.
    """
    self.model.train()
    # Progress is tracked per epoch: total_steps counts epochs, so the step
    # counter advances once per epoch rather than once per batch.
    total_steps = self.num_epochs
    current_step = 0
    batch_count = len(self.train_loader)

    for _ in range(self.num_epochs):
        total_loss = 0
        for x, y in self.train_loader:
            x = x.to(torch.float32)
            y = y.to(torch.long)
            self.optimizer.zero_grad()
            y_predicted = self.model(x)
            loss = self.loss_fn(y_predicted, y)
            total_loss += loss.item()
            loss.backward()
            self.optimizer.step()

        current_step += 1
        progress = int((current_step / total_steps) * 100)
        if progress_updated is not None:
            progress_updated.emit(progress)
        if qthread is not None:
            qthread.msleep(1)
        print('finished one epoch')

        # An empty loader would otherwise raise ZeroDivisionError.
        avg_loss = total_loss / batch_count if batch_count > 0 else 0
        if loss_updated is not None:
            loss_updated.emit(avg_loss)
Evaluation Loop
Evaluates the model on a validation set by computing classification accuracy.
def evaluate(self, eval_dataset=None):
    """Return the classification accuracy of ``self.model``.

    Args:
        eval_dataset: Optional dataset to evaluate on. When given, it is
            wrapped in a ``DataLoader`` with batch size ``self.num_batches``
            and no shuffling; when ``None``, ``self.val_loader`` is used.

    Returns:
        The fraction of correctly classified samples, or ``0`` when no
        samples were evaluated.
    """
    self.model.eval()
    correct_predictions = 0
    total_samples = 0

    # Compare against None instead of relying on truthiness: an explicitly
    # passed empty dataset must not silently fall back to the default loader.
    if eval_dataset is not None:
        val_loader = DataLoader(eval_dataset, batch_size=self.num_batches, shuffle=False)
    else:
        val_loader = self.val_loader

    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(torch.float32)
            y = y.to(torch.long)
            y_predicted = self.model(x)
            # The predicted class is the index of the largest output value.
            pred = torch.argmax(y_predicted, dim=1)
            correct_predictions += pred.eq(y).sum().item()
            total_samples += y.size(0)

    return correct_predictions / total_samples if total_samples > 0 else 0
Deployment (Use the Trained Model)
This class loads and preprocesses an image for classification using the trained model.
class UserCLI(User):
    """Classify a single image file with a trained model."""

    def __init__(self, model, tokenizer, device, max_length):
        # All arguments are forwarded unchanged to the base class.
        super().__init__(model, tokenizer, device, max_length)

    def use_model(self, input, image_size=(500, 500)):
        """Load an image, preprocess it and return the predicted class index.

        Args:
            input: Path (or file object) of the image to classify.
            image_size: ``(width, height)`` the image is resized to before
                inference. Defaults to ``(500, 500)``; it should match the
                dimensions the model was initialised with.

        Returns:
            The index of the highest-scoring class as a Python ``int``.
        """
        # The context manager closes the underlying file once the pixel data
        # has been converted into a new in-memory RGB image.
        with Image.open(input) as source:
            img = source.convert("RGB")
        img = img.resize(image_size)

        img_array = np.array(img)
        # Reorder height x width x channels to channels x height x width and
        # scale the 8-bit pixel values to the range [0, 1].
        img_tensor = torch.tensor(img_array).permute(2, 0, 1).float() / 255.0
        x = img_tensor.unsqueeze(0)  # Shape: [1, 3, height, width]

        with torch.no_grad():
            y_predicted = self.model(x)
            pred = torch.argmax(y_predicted, dim=1)
        return pred.item()
# Example: load the downloaded weights and classify one image.
model_path = "path/to/model.bin"

model = CLIModelCNN()
model.initialise(input_size=(3, 500, 500), output_size=5)  # Match the dimensions used in training

# NOTE: torch.load unpickles the file. weights_only=True restricts loading to
# tensors and plain containers, which is all a state dict needs, so a
# downloaded file cannot execute arbitrary code while being loaded.
model.load_state_dict(
    torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
)
model.eval()

user = UserCLI(model=model, tokenizer=None, device="cpu", max_length=None)

image_path = "path/to/image.jpg"  # Path to input image
prediction = user.use_model(image_path)
print("Predicted class:", prediction)
The UserCLI class provides a simple interface for classifying new images after training is complete.