# STEP 1: Imports
# Reference tutorials:
#   https://appliediot.in/Regression-Examples/
#   https://appliediot.in/classification-examples/
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk

# Corpus data needed below: 'stopwords' for the stopword filter,
# 'wordnet' because WordNetLemmatizer is used in clean_text().
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# STEP 2: Load data
df = pd.read_csv('text_reviews.csv')

# STEP 3: Clean the text
# Build the stopword set once at module level (reused by clean_text).
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Normalize one raw review string for vectorization.

    Pipeline: strip HTML markup, lowercase, remove URLs/@mentions/#hashtags,
    digits, punctuation, non-alphabetic and non-ASCII characters, collapse
    whitespace, drop English stopwords, then stem and lemmatize each token.

    Parameters
    ----------
    text : str
        Raw review text (may contain HTML).

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    text = BeautifulSoup(text, "html.parser").get_text()  # remove HTML
    text = text.lower()
    # Remove URLs/mentions/hashtags BEFORE stripping punctuation: these
    # regexes rely on '://', '@' and '#' characters still being present.
    text = re.sub(r"http\S+", "", text)   # remove URLs
    text = re.sub(r"@\w+", "", text)      # remove mentions
    text = re.sub(r"#\w+", "", text)      # remove hashtags
    text = re.sub(r"\d+", "", text)       # remove digits
    text = text.translate(str.maketrans("", "", string.punctuation))  # punctuation
    text = re.sub("[^a-zA-Z]", " ", text)      # non-alphabetic characters
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # emojis / non-ASCII
    text = re.sub(r"\s+", " ", text).strip()   # collapse extra whitespace
    # Remove stopwords (module-level stop_words set).
    text = " ".join(word for word in text.split() if word not in stop_words)
    # Stemming.
    stemmer = PorterStemmer()
    text = " ".join(stemmer.stem(word) for word in text.split())
    # Lemmatization (optional, after stemming or instead of it).
    lemmatizer = WordNetLemmatizer()
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    return text
####
# Time-series feature-engineering examples with pandas.
import pandas as pd
import numpy as np

# Sample data: 10 consecutive days of random sales figures.
data = {
    'date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
    'sales': np.random.randint(100, 200, size=10)
}

# Create DataFrame
df = pd.DataFrame(data)

# -------------------------------
# 1. Convert to datetime (if not already)
df['date'] = pd.to_datetime(df['date'])

# 2. Set date as index
df.set_index('date', inplace=True)

# 3. Date attributes
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek

# 4. Resampling (note: pandas >= 2.2 prefers the 'ME' alias for month-end)
monthly_avg = df.resample('M').mean()
weekly_sum = df.resample('W').sum()

# 5. Filter by date range (slice is inclusive on both ends)
filtered = df['2023-01-03':'2023-01-06']

# 6. Create custom date range
custom_range = pd.date_range(start='2023-02-01', periods=5, freq='D')

# 7. Sorting by date (already sorted, but example)
df = df.sort_index()

# 8. Format dates (for string parsing, example)
df_reset = df.reset_index()
df_reset['date'] = pd.to_datetime(df_reset['date'], format='%Y-%m-%d')

# 9. Add date offset (add 1 month to index)
df_offset = df.copy()
df_offset.index = df_offset.index + pd.DateOffset(months=1)

# 10. Fill missing dates (simulate missing dates first)
df_missing = df.copy()
df_missing = df_missing.drop(df_missing.index[3])  # drop 4th row to simulate gap
df_missing = df_missing.asfreq('D')  # reindex with daily freq
df_missing = df_missing.ffill()      # forward fill (fillna(method=...) is deprecated)

# Show final outputs
print("Original Data:\n", df.head(), '\n')
print("Monthly Avg:\n", monthly_avg, '\n')
print("Weekly Sum:\n", weekly_sum, '\n')
print("Filtered Dates:\n", filtered, '\n')
print("Date with Offset:\n", df_offset.head(), '\n')
print("Missing Dates Handled:\n", df_missing.head(), '\n')
####
# Row-wise feature engineering with DataFrame.apply and assign.
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'a': [1, 2, 3, 4],
    'b': [10, 20, 30, 40],
    'first_name': ['John', 'Alice', 'Bob', 'Eve'],
    'last_name': ['Doe', 'Smith', 'Brown', 'White']
})

# 1. Add two numeric columns
df['sum'] = df.apply(lambda row: row['a'] + row['b'], axis=1)

# 2. Conditional labeling using multiple columns
df['label'] = df.apply(
    lambda row: 'high' if row['a'] + row['b'] > 25 else 'low',
    axis=1
)

# 3. Create full name from first and last names
df['full_name'] = df.apply(lambda row: f"{row['first_name']} {row['last_name']}", axis=1)

# 4. Create a new feature with a custom formula (e.g., weighted score)
df['weighted'] = df.apply(lambda row: 0.3 * row['a'] + 0.7 * row['b'], axis=1)

# 5. Use assign() + lambda for clean chaining (optional)
df = df.assign(
    diff=lambda x: x['b'] - x['a'],  # difference between columns
    score_label=lambda x: ['pass' if v >= 20 else 'fail' for v in x['weighted']]
)

# Final output
print(df)
####
# STEP 3 (continued): apply the cleaner to every review.
# NOTE(review): this span originally held the orphaned, unindented tail of
# clean_text (emoji/whitespace/stopword removal, stemming, lemmatization,
# and its `return`); that logic belongs inside the function body at its
# `def` site, not at module level where it is a SyntaxError.
df['cleaned_review'] = df['Review'].apply(clean_text)
# STEP 4: Convert text to vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned_review'])

# STEP 5: Encode target labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['Sentiment'])

# STEP 6: Train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# STEP 7: Train model
from sklearn.linear_model import LogisticRegression

# max_iter raised from the default 100: high-dimensional sparse TF-IDF
# features commonly trip the lbfgs convergence warning otherwise.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# STEP 8: Predict and evaluate
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))