# Imports
import pandas as pd
import numpy as np
import re
import string
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Download the NLTK English stopword corpus (one-time setup; cached locally afterwards)
nltk.download('stopwords')
# Text cleaning helper
def clean_text(text, stop_words=None):
    """Normalize raw text for TF-IDF vectorization.

    Lowercases the input, strips HTML tags, URLs, digits, punctuation and
    extra whitespace, then removes stopwords.

    Args:
        text: Raw input; coerced with ``str`` so ``None``/NaN are tolerated.
        stop_words: Optional collection of words to drop. Defaults to the
            NLTK English stopword list (requires the ``stopwords`` corpus
            to have been downloaded).

    Returns:
        The cleaned, single-space-joined string.
    """
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)    # remove HTML tags
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\d+', '', text)      # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)
# Load dataset
df = pd.read_csv("text_data.csv")  # NOTE: replace with your dataset path
df = df.dropna()

# Clean the raw text column
df['cleaned_review'] = df['review'].apply(clean_text)  # NOTE: replace 'review' with your text column

# Encode the target labels as integers
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['sentiment'])  # NOTE: replace 'sentiment' with your label column

# Train-test split (80/20, fixed seed for reproducibility)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['cleaned_review'], df['label_encoded'], test_size=0.2, random_state=42
)
# TF-IDF vectorization: fit on the training split only to avoid data leakage
tfidf = TfidfVectorizer(max_features=10000)
X_train = tfidf.fit_transform(X_train_raw)
X_test = tfidf.transform(X_test_raw)
# NOTE(review): removed two stray pasted snippets that were here:
#  1. a second TfidfVectorizer(...) collapsed onto one line whose inline '#'
#     comments swallowed the rest of the call (syntax error); it also rebound
#     the already-fitted `tfidf` and transformed an undefined `corpus` variable;
#  2. an unrelated requests/BeautifulSoup web-scraping example that does not
#     belong to this training pipeline.
# Candidate models (default hyperparameters)
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "Support Vector Machine": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
}
results = []

# Train and evaluate each model on the shared TF-IDF features
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"\n[{name}]")
    print("Accuracy:", acc)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion matrix heatmap
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # Collect accuracy for the summary chart
    results.append({"Model": name, "Accuracy": acc})
# Summary table, best model first
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("\nModel Accuracy Comparison:")
print(results_df)

# Bar chart of model accuracies
sns.barplot(data=results_df, x="Model", y="Accuracy", palette="viridis")
plt.xticks(rotation=45)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy Score")
plt.ylim(0, 1)  # accuracy is bounded in [0, 1]
plt.tight_layout()
plt.show()