Tuesday, 30 December 2025

Another AI: Simple Fuzzy Logic to predict XOR Logic Output

Implementing XOR with Fuzzy Logic in Python

While neural networks learn the XOR function through training, fuzzy logic can solve it directly using human-like rules and membership degrees. In this simple Python script, we build a Mamdani-style fuzzy inference system with just two fuzzy sets ("Low" and "High") and four intuitive rules — perfectly replicating the XOR truth table without any training loop. The result? Crisp 0 or 1 outputs for binary inputs, and smooth, interpretable responses for real-valued or noisy data. A clean, educational alternative to neural networks that shows the power of fuzzy reasoning.






Python Code from Grok

import numpy as np  # Not strictly needed here, but kept for consistency

# Membership functions: simple linear for crisp 0/1 behavior
def membership_low(x):
    """Degree to which x belongs to 'Low' (0)"""
    return max(0.0, min(1.0, 1.0 - x))  # Clips to [0,1]

def membership_high(x):
    """Degree to which x belongs to 'High' (1)"""
    return max(0.0, min(1.0, x))

# Fuzzy XOR using Mamdani-style inference
def fuzzy_xor(a, b):
    # Fuzzification
    low_a  = membership_low(a)
    high_a = membership_high(a)
    low_b  = membership_low(b)
    high_b = membership_high(b)
    
    # Fuzzy rules for XOR:
    # 1. If A is Low  and B is Low  → Output Low
    # 2. If A is Low  and B is High → Output High
    # 3. If A is High and B is Low  → Output High
    # 4. If A is High and B is High → Output Low
    
    rule1 = min(low_a, low_b)    # → Low
    rule2 = min(low_a, high_b)   # → High
    rule3 = min(high_a, low_b)   # → High
    rule4 = min(high_a, high_b)  # → Low
    
    # Aggregation: collect strength for Low and High output
    strength_low  = max(rule1, rule4)
    strength_high = max(rule2, rule3)
    
    # Defuzzification: center of gravity (centroids at 0 and 1)
    total = strength_low + strength_high
    if total == 0:
        return 0.5  # Neutral (shouldn't happen with valid inputs)
    
    output = (strength_low * 0 + strength_high * 1) / total
    return output

# Display results on standard XOR training data
print("Fuzzy Logic XOR - Results on training data:")
print("-" * 50)
training_inputs = [[0, 0], [0, 1], [1, 0], [1, 1]]
expected_outputs = [0, 1, 1, 0]

for inp, expected in zip(training_inputs, expected_outputs):
    result = fuzzy_xor(inp[0], inp[1])
    print(f"Input: {inp} → Output: {result:.3f}  (expected: {expected})")

print("\n" + "="*60)
print("   Interactive Fuzzy XOR Tester")
print("   Enter two numbers (0 or 1 recommended, but any real value works)")
print("="*60)

# Interactive loop for user input
while True:
    try:
        print("\nEnter two inputs separated by space (or type 'quit' to exit):")
        user_input = input("> ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q', '']:
            print("Goodbye!")
            break
        
        parts = user_input.split()
        if len(parts) != 2:
            print("Error: Please enter exactly two numbers.")
            continue
        
        try:
            a = float(parts[0])
            b = float(parts[1])
        except ValueError:
            print("Error: Invalid numbers. Please enter numeric values.")
            continue
        
        # Compute fuzzy XOR
        prediction = fuzzy_xor(a, b)
        binary_prediction = 1 if prediction >= 0.5 else 0
        
        # Classic boolean XOR for comparison (using threshold 0.5)
        classic_xor = 1 if (a > 0.5) != (b > 0.5) else 0
        
        print(f"\nInput:         [{a}, {b}]")
        print(f"Fuzzy output:  {prediction:.4f}")
        print(f"Rounded class: {binary_prediction}")
        print(f"Classic XOR:   {classic_xor}  (for reference, using >0.5 threshold)")
        
    except KeyboardInterrupt:
        print("\nGoodbye!")
        break
    except Exception as e:
        print(f"Unexpected error: {e}")

Sunday, 28 December 2025

Simple ANN to predict XOR Logic Output


We can see from the training results that the network learned the XOR function correctly (outputs close to 0, 1, 1, 0).

When prompted to key in two inputs, we can also see that the output is predicted correctly.


Python Code from Grok

import numpy as np

# Sigmoid activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of sigmoid for backpropagation
def sigmoid_derivative(x):
    return x * (1 - x)

# Input data (XOR truth table)
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])

# Expected output
y = np.array([[0],
              [1],
              [1],
              [0]])

# Seed for reproducibility
np.random.seed(42)

# Network architecture
input_neurons = 2
hidden_neurons = 4
output_neurons = 1

# Randomly initialize weights
weights_input_hidden = np.random.uniform(size=(input_neurons, hidden_neurons))
weights_hidden_output = np.random.uniform(size=(hidden_neurons, output_neurons))

# Learning rate
learning_rate = 1.0

# Training loop
print("Training the neural network on XOR...")
for epoch in range(10000):
    # Forward pass
    hidden_input = np.dot(X, weights_input_hidden)
    hidden_output = sigmoid(hidden_input)
    
    final_input = np.dot(hidden_output, weights_hidden_output)
    predicted_output = sigmoid(final_input)
    
    # Compute error
    error = y - predicted_output
    
    # Backpropagation
    d_output = error * sigmoid_derivative(predicted_output)
    
    error_hidden = d_output.dot(weights_hidden_output.T)
    d_hidden = error_hidden * sigmoid_derivative(hidden_output)
    
    # Update weights
    weights_hidden_output += hidden_output.T.dot(d_output) * learning_rate
    weights_input_hidden += X.T.dot(d_hidden) * learning_rate

print("Training complete!\n")

# Show results on training data
print("Results on training data (XOR):")
for i in range(len(X)):
    print(f"Input: {X[i]} → Predicted: {predicted_output[i][0]:.3f} (expected: {y[i][0]})")

print("\n" + "="*40)
print("Now you can test the network yourself!")
print("="*40)

# Interactive prediction loop
while True:
    try:
        print("\nEnter two binary inputs (0 or 1), separated by space (or type 'quit' to exit):")
        user_input = input("> ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break
        
        values = list(map(float, user_input.split()))
        
        if len(values) != 2:
            print("Please enter exactly two numbers.")
            continue
        
        if not all(v in [0, 1] for v in values):
            print("Please enter only 0 or 1 for each input.")
            continue
        
        # Convert to numpy array and predict
        input_data = np.array([values])
        
        hidden_layer = sigmoid(np.dot(input_data, weights_input_hidden))
        output = sigmoid(np.dot(hidden_layer, weights_hidden_output))
        
        prediction = output[0][0]
        rounded = 1 if prediction >= 0.5 else 0
        
        print(f"\nInput:  [{values[0]}, {values[1]}]")
        print(f"Network output: {prediction:.4f}")
        print(f"Predicted class: {rounded} → This is XOR: {int(values[0] != values[1])}")
        
    except ValueError:
        print("Invalid input. Please enter numbers separated by space.")
    except Exception as e:
        print(f"Error: {e}")

 

Saturday, 27 December 2025

Sigmoid Function (ANN Activation Function) and Its Shape

σ(x) = 1 / (1 + e^(−x))

The sigmoid function is defined by the formula above. It maps any input x to a value between 0 and 1.
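A quick check of this behaviour in Python (a minimal sketch; the sample x values are arbitrary):

import numpy as np

def sigmoid(x):
    """Sigmoid: squashes any real input into the open interval (0, 1)."""
    return 1.0 / (1.0 + np.exp(-x))

# Sample points chosen for illustration
for x in (-5.0, -1.0, 0.0, 1.0, 5.0):
    print(f"sigmoid({x:+.1f}) = {sigmoid(x):.4f}")
# Prints approximately 0.0067, 0.2689, 0.5000, 0.7311, 0.9933:
# close to 0 for very negative x, exactly 0.5 at x = 0, close to 1 for large x.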

Activation functions are a crucial component of neural networks, responsible for introducing non-linearity into the model. Without non-linear activation functions, a neural network would essentially behave like a linear model, regardless of the number of layers, limiting its capacity to solve complex problems. The "best" activation function is often found by quick experimentation on your specific dataset.

How do people choose the activation function for an ANN?

Standard rules of thumb (what practitioners actually do)

🔹 Hidden layers (most important choice)

Activation         | When used                    | Why
ReLU               | Default choice               | Simple, fast, no vanishing gradient
Leaky ReLU / GELU  | Deeper or transformer models | Fixes the “dead ReLU” problem
tanh               | Small networks               | Zero-centered but vanishing gradients
sigmoid            | Rare today                   | Severe vanishing gradient

👉 Rule:

If unsure → start with ReLU (or GELU)
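As a quick illustration of the two most common hidden-layer choices, here is a minimal NumPy sketch of ReLU and Leaky ReLU (the 0.01 negative-side slope is an assumed, commonly used default):

import numpy as np

def relu(x):
    """ReLU: passes positive values through, zeroes out negatives."""
    return np.maximum(0.0, x)

def leaky_relu(x, alpha=0.01):
    """Leaky ReLU: the small slope alpha keeps negative inputs from 'dying'."""
    return np.where(x > 0, x, alpha * x)

x = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print("ReLU:      ", relu(x))        # [0.    0.    0.    0.5   3.  ]
print("Leaky ReLU:", leaky_relu(x))  # [-0.03  -0.005  0.    0.5   3.  ]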


🔹 Output layer (data-dependent)

Here people do consider the data.

Problem type            | Output activation | Reason
Binary classification   | Sigmoid           | Outputs a probability (0–1)
Multi-class (one label) | Softmax           | Class probabilities sum to 1
Regression (unbounded)  | Linear            | No restriction
Regression (0–1)        | Sigmoid           | Bounded output
Regression (−1 to 1)    | tanh              | Symmetric range

This is the only place where data range strongly drives activation choice.
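To make the output-layer table concrete, here is a minimal NumPy sketch of the two most common output activations; the logit values are made-up numbers for illustration:

import numpy as np

def sigmoid(z):
    """Binary classification output: a single probability in (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

def softmax(z):
    """Multi-class output: probabilities over the classes that sum to 1."""
    e = np.exp(z - np.max(z))  # subtract the max for numerical stability
    return e / e.sum()

print(sigmoid(1.2))                        # ~0.77: probability of the positive class
print(softmax(np.array([2.0, 1.0, 0.1])))  # ~[0.66, 0.24, 0.10], sums to 1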


Just a Simulation

Example of Sigmoid Curve Plotting 
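Below is a minimal matplotlib sketch of such a curve, framed as skill versus practice hours; the saturation level (100), the midpoint (about 45 hours) and the steepness (0.12) are assumed values chosen only to reproduce the shape interpreted below:

import numpy as np
import matplotlib.pyplot as plt

hours = np.linspace(0, 100, 200)
# Assumed parameters: skill saturates at 100, midpoint ~45 hours, steepness 0.12
skill = 100.0 / (1.0 + np.exp(-0.12 * (hours - 45.0)))

plt.plot(hours, skill, linewidth=2)
plt.xlabel("Practice hours")
plt.ylabel("Skill level (arbitrary units)")
plt.title("Sigmoid-shaped learning curve")
plt.grid(True)
plt.show()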


How to interpret the graph

  • Early stage (0–20 hrs): slow gains while fundamentals are forming

  • Middle stage (30–60 hrs): rapid skill growth (steep slope)

  • Later stage (70+ hrs): plateau as learning saturates





References

Johnson, Peter. Fundamentals of Machine Learning: An Introduction to Neural Networks (p. 85). Kindle Edition. 

 

Friday, 26 December 2025

Neural Network ABC: What is a Weight?



The basic computational unit of a neural network is the neuron, which aggregates incoming signals, applies a weighted sum, and passes the result through an activation function.

If we assume y = mx + c, then we can see w as playing the role of m. W is the weight: it scales the input before everything is summed.

To learn what is a weight, we can try to learn from this simple example.

What is a Weighted Average?

A weighted average is a type of average where each value contributes to the final result based on its importance or weight. Unlike a simple (arithmetic) average, where every value has equal influence, in a weighted average the values with higher weights have more impact.

Formula

Weighted average = (w1·x1 + w2·x2 + … + wn·xn) / (w1 + w2 + … + wn)



Simple Example: School Grades

Suppose your final grade in a class is calculated as follows:
  • Homework: 20% of the grade → average score = 85
  • Quizzes: 30% of the grade → average score = 90
  • Final Exam: 50% of the grade → score = 78
Here, the weights are the percentages (0.20, 0.30, 0.50), and the values are the scores.

Step-by-step calculation:
  1. Multiply each score by its weight:
    • Homework: 85 × 0.20 = 17.0
    • Quizzes: 90 × 0.30 = 27.0
    • Final Exam: 78 × 0.50 = 39.0
  2. Sum these products:
    • 17.0 + 27.0 + 39.0 = 83.0
  3. Sum of weights = 0.20 + 0.30 + 0.50 = 1.00 (or 100%)
  4. Final grade = 83.0 / 1.00 = 83.0
Your final grade is 83.
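Here is the same calculation as a minimal NumPy sketch, followed by the analogous weighted sum inside a neuron (the neuron's weights and bias are illustrative, assumed values):

import numpy as np

scores  = np.array([85.0, 90.0, 78.0])   # homework, quizzes, final exam
weights = np.array([0.20, 0.30, 0.50])   # importance of each component

weighted_average = np.dot(weights, scores) / weights.sum()
print(weighted_average)                  # 83.0

# A neuron does essentially the same thing: a weighted sum of its inputs
# (plus a bias), passed through an activation function.
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

inputs = np.array([0.0, 1.0])            # e.g. two binary inputs
w      = np.array([0.4, -0.7])           # illustrative, assumed weights
bias   = 0.1
print(sigmoid(np.dot(w, inputs) + bias)) # the neuron's output, here ~0.35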




References

Johnson, Peter. Fundamentals of Machine Learning: An Introduction to Neural Networks (p. 67). Kindle Edition. 

https://statisticsbyjim.com/basics/weighted-average/


Thursday, 25 December 2025

2-Dimension to 1-Dimension Reduction Using Principal Component Analysis

Understanding Eigenvalues and Principal Component Analysis (PCA)

What Are Eigenvalues?

Imagine a square matrix as a “transformation machine” that takes vectors (arrows in space) as input and spits out transformed vectors. Most directions get twisted or bent, but there are special directions, called eigenvectors, that pass through unchanged except for being stretched or shrunk (or flipped). The eigenvalue (λ) is the scaling factor that tells you how much the eigenvector is stretched or shrunk:

A·v = λ·v
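Here is a quick NumPy sketch that verifies this relationship; the 2×2 matrix is an arbitrary example, not the covariance matrix used later:

import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])   # arbitrary symmetric example matrix

eigenvalues, eigenvectors = np.linalg.eig(A)
for i in range(len(eigenvalues)):
    v = eigenvectors[:, i]
    lam = eigenvalues[i]
    # A @ v should equal lam * v (up to floating-point error)
    print(lam, np.allclose(A @ v, lam * v))
# The eigenvalues are 3 and 1, and both checks print True.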
Why do they matter?

  • In real applications like Google’s PageRank, the largest eigenvalue helps rank web pages by importance.

Output from the Python code below:

Eigenvalues (sorted descending):
[1.73716614 0.05354182]

Total variance: 1.79070796
Variance explained by PC1: 1.73716614
Percentage by PC1: 96.99 %


PCA with Explicit Eigenvalues Highlighted

In Principal Component Analysis (PCA), eigenvalues play a central role:
  • We compute the eigen decomposition of the covariance matrix.
  • The eigenvectors give the directions of the new axes (principal components).
  • The eigenvalues tell us how much variance (spread) each principal component captures.
  • We sort by descending eigenvalues and keep the top ones — here, the largest eigenvalue corresponds to PC1, capturing ~97% of the total variance, so reducing to 1D loses almost nothing.
What Is Principal Component Analysis (PCA)?

PCA is a technique to simplify high-dimensional data while keeping as much information as possible. It does this by finding new axes (principal components) aligned with the directions of greatest variance.

Here’s how PCA works step by step:
  1. Center the data — subtract the mean so the cloud is centered at the origin.
  2. Compute the covariance matrix — measures how features vary together.
  3. Perform eigen decomposition on the covariance matrix.
    • The eigenvectors become the new axes (principal components).
    • The eigenvalues tell you how much variance each axis captures.
  4. Sort by eigenvalues (largest first) and project the data onto the top k components.
The first principal component (PC1) is the direction of maximum spread. PC2 is the next, perpendicular to PC1, and so on.
Why use PCA?
  • Dimensionality reduction: Turn 1000 features into 50 without losing much information.
  • Visualization: Plot high-dimensional data in 2D or 3D.
  • Noise removal: Small eigenvalues often correspond to noise; dropping them cleans the data.
  • Speed: Fewer dimensions make machine learning models faster and less prone to overfitting.
A classic example is facial recognition ("eigenfaces"). Thousands of pixel values per image are reduced to a handful of principal components that capture the main variations (lighting, expression, pose), allowing efficient storage and comparison.

The Key Connection: Eigenvalues Power PCA

Eigenvalues are the heart of PCA. They quantify "importance":
  • Large eigenvalue → that direction explains a lot of the data’s variability → keep it.
  • Small eigenvalue → little information → safe to discard.
In practice, you might keep enough components to explain 95% of the total variance (sum of all eigenvalues).
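As a minimal sketch of that 95% rule, here is how the cut-off could be computed; the eigenvalues below are made-up illustrative numbers, not taken from the dataset used later:

import numpy as np

# Assumed example eigenvalues, already sorted in descending order
eigenvalues = np.array([4.2, 1.8, 0.6, 0.25, 0.15])

explained_ratio = eigenvalues / eigenvalues.sum()
cumulative = np.cumsum(explained_ratio)
print(cumulative)                      # roughly [0.60 0.86 0.94 0.98 1.00]

# Keep the smallest k whose cumulative explained variance reaches 95%
k = int(np.searchsorted(cumulative, 0.95) + 1)
print("Components to keep:", k)        # 4 for these example values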
Final Thoughts

Eigenvalues help us understand the core scaling behavior of linear transformations, while PCA uses them to intelligently compress and reveal structure in data. Together, they’re essential tools in data analysis, machine learning, image processing, and even physics. Next time you hear about "reducing dimensions" or "finding principal directions," you’ll know it’s eigenvalues doing the heavy lifting behind the scenes!

Python code from Grok

import numpy as np
import matplotlib.pyplot as plt

# Dataset
data = np.array([[2.5, 2.4],
                 [0.5, 0.7],
                 [2.2, 2.9],
                 [1.9, 2.2],
                 [3.1, 3.0]])

# Center the data
mean = np.mean(data, axis=0)
centered_data = data - mean

# Covariance matrix and eigen decomposition
cov_matrix = np.cov(data.T)
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

# Sort by largest eigenvalue
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]

# Print eigenvalues and variance explained
print("Eigenvalues (sorted descending):")
print(eigenvalues)
print("\nTotal variance:", eigenvalues.sum())
print("Variance explained by PC1:", eigenvalues[0])
print("Percentage by PC1:", 100 * eigenvalues[0] / eigenvalues.sum(), "%")

# First principal component
pc1 = eigenvectors[:, 0]

# Project onto PC1 (1D scores)
projected_scores = centered_data @ pc1

# Reconstructed points in original 2D space from 1D projection
reconstructed_2d = np.outer(projected_scores, pc1) + mean

# Plot
fig, axs = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: Original data with PC1 and PC2 directions (both eigenvectors)
axs[0].scatter(data[:, 0], data[:, 1], color='blue', s=100, label='Original data points')
axs[0].scatter(mean[0], mean[1], color='red', marker='X', s=250, label='Mean (center)')
scale = 2.5
axs[0].arrow(mean[0], mean[1], eigenvectors[0,0]*scale, eigenvectors[1,0]*scale,
             head_width=0.15, head_length=0.2, fc='green', ec='green', linewidth=4,
             label=f'PC1 (eigenvalue ≈ {eigenvalues[0]:.3f})')
axs[0].arrow(mean[0], mean[1], eigenvectors[0,1]*scale, eigenvectors[1,1]*scale,
             head_width=0.15, head_length=0.2, fc='orange', ec='orange', linewidth=4,
             label=f'PC2 (eigenvalue ≈ {eigenvalues[1]:.3f})')
axs[0].set_xlabel('X (original feature 1)')
axs[0].set_ylabel('Y (original feature 2)')
axs[0].set_title('Original 2D Data with Principal Components\n(Eigenvectors of Covariance Matrix)')
axs[0].grid(True)
axs[0].legend()
axs[0].axis('equal')

# Right panel: Projections shown in X-Y space
axs[1].scatter(data[:, 0], data[:, 1], color='lightblue', alpha=0.6, s=100, label='Original points')
axs[1].scatter(reconstructed_2d[:, 0], reconstructed_2d[:, 1], color='red', s=100, label='Projected points (1D)')

# Draw the principal axis line (PC1)
t = np.linspace(projected_scores.min() - 1, projected_scores.max() + 1, 100)
line_x = mean[0] + t * pc1[0]
line_y = mean[1] + t * pc1[1]
axs[1].plot(line_x, line_y, color='green', linewidth=4, label='1D Principal Axis (PC1)')

# Draw dashed lines from original to projected points
for i in range(len(data)):
    orig = data[i]
    proj = reconstructed_2d[i]
    axs[1].plot([orig[0], proj[0]], [orig[1], proj[1]], color='gray', linestyle='--', linewidth=1.5)

axs[1].set_xlabel('X (original feature 1)')
axs[1].set_ylabel('Y (original feature 2)')
axs[1].set_title('After PCA: Data Reduced to 1D\n(Large eigenvalue direction captures most variance)')
axs[1].grid(True)
axs[1].legend()
axs[1].axis('equal')

plt.tight_layout()
plt.show()