# STEP 1: Imports
# Reference tutorials:
#   https://appliediot.in/Regression-Examples/
#   https://appliediot.in/classification-examples/
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk

# Corpus data needed below: 'stopwords' for the stopword filter,
# 'wordnet' because WordNetLemmatizer is used in clean_text().
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# STEP 2: Load data
df = pd.read_csv('text_reviews.csv')

# STEP 3: Clean the text
# Build the stopword set once at module level (reused by clean_text).
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Normalize one raw review string for vectorization.

    Pipeline: strip HTML markup, lowercase, remove URLs/@mentions/#hashtags,
    digits, punctuation, non-alphabetic and non-ASCII characters, collapse
    whitespace, drop English stopwords, then stem and lemmatize each token.

    Parameters
    ----------
    text : str
        Raw review text (may contain HTML).

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    text = BeautifulSoup(text, "html.parser").get_text()  # remove HTML
    text = text.lower()
    # Remove URLs/mentions/hashtags BEFORE stripping punctuation: these
    # regexes rely on '://', '@' and '#' characters still being present.
    text = re.sub(r"http\S+", "", text)   # remove URLs
    text = re.sub(r"@\w+", "", text)      # remove mentions
    text = re.sub(r"#\w+", "", text)      # remove hashtags
    text = re.sub(r"\d+", "", text)       # remove digits
    text = text.translate(str.maketrans("", "", string.punctuation))  # punctuation
    text = re.sub("[^a-zA-Z]", " ", text)      # non-alphabetic characters
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # emojis / non-ASCII
    text = re.sub(r"\s+", " ", text).strip()   # collapse extra whitespace
    # Remove stopwords (module-level stop_words set).
    text = " ".join(word for word in text.split() if word not in stop_words)
    # Stemming.
    stemmer = PorterStemmer()
    text = " ".join(stemmer.stem(word) for word in text.split())
    # Lemmatization (optional, after stemming or instead of it).
    lemmatizer = WordNetLemmatizer()
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    return text
####
# Time-series feature-engineering examples with pandas.
import pandas as pd
import numpy as np

# Sample data: 10 consecutive days of random sales figures.
data = {
    'date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
    'sales': np.random.randint(100, 200, size=10)
}

# Create DataFrame
df = pd.DataFrame(data)

# -------------------------------
# 1. Convert to datetime (if not already)
df['date'] = pd.to_datetime(df['date'])

# 2. Set date as index
df.set_index('date', inplace=True)

# 3. Date attributes
df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['dayofweek'] = df.index.dayofweek

# 4. Resampling (note: pandas >= 2.2 prefers the 'ME' alias for month-end)
monthly_avg = df.resample('M').mean()
weekly_sum = df.resample('W').sum()

# 5. Filter by date range (slice is inclusive on both ends)
filtered = df['2023-01-03':'2023-01-06']

# 6. Create custom date range
custom_range = pd.date_range(start='2023-02-01', periods=5, freq='D')

# 7. Sorting by date (already sorted, but example)
df = df.sort_index()

# 8. Format dates (for string parsing, example)
df_reset = df.reset_index()
df_reset['date'] = pd.to_datetime(df_reset['date'], format='%Y-%m-%d')

# 9. Add date offset (add 1 month to index)
df_offset = df.copy()
df_offset.index = df_offset.index + pd.DateOffset(months=1)

# 10. Fill missing dates (simulate missing dates first)
df_missing = df.copy()
df_missing = df_missing.drop(df_missing.index[3])  # drop 4th row to simulate gap
df_missing = df_missing.asfreq('D')  # reindex with daily freq
df_missing = df_missing.ffill()      # forward fill (fillna(method=...) is deprecated)

# Show final outputs
print("Original Data:\n", df.head(), '\n')
print("Monthly Avg:\n", monthly_avg, '\n')
print("Weekly Sum:\n", weekly_sum, '\n')
print("Filtered Dates:\n", filtered, '\n')
print("Date with Offset:\n", df_offset.head(), '\n')
print("Missing Dates Handled:\n", df_missing.head(), '\n')
####
# Row-wise feature engineering with DataFrame.apply and assign.
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'a': [1, 2, 3, 4],
    'b': [10, 20, 30, 40],
    'first_name': ['John', 'Alice', 'Bob', 'Eve'],
    'last_name': ['Doe', 'Smith', 'Brown', 'White']
})

# 1. Add two numeric columns
df['sum'] = df.apply(lambda row: row['a'] + row['b'], axis=1)

# 2. Conditional labeling using multiple columns
df['label'] = df.apply(
    lambda row: 'high' if row['a'] + row['b'] > 25 else 'low',
    axis=1
)

# 3. Create full name from first and last names
df['full_name'] = df.apply(lambda row: f"{row['first_name']} {row['last_name']}", axis=1)

# 4. Create a new feature with a custom formula (e.g., weighted score)
df['weighted'] = df.apply(lambda row: 0.3 * row['a'] + 0.7 * row['b'], axis=1)

# 5. Use assign() + lambda for clean chaining (optional)
df = df.assign(
    diff=lambda x: x['b'] - x['a'],  # difference between columns
    score_label=lambda x: ['pass' if v >= 20 else 'fail' for v in x['weighted']]
)

# Final output
print(df)
####
# STEP 3 (continued): apply the cleaner to every review.
# NOTE(review): this span originally held the orphaned, unindented tail of
# clean_text (emoji/whitespace/stopword removal, stemming, lemmatization,
# and its `return`); that logic belongs inside the function body at its
# `def` site, not at module level where it is a SyntaxError.
df['cleaned_review'] = df['Review'].apply(clean_text)
# STEP 4: Convert text to vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned_review'])

# STEP 5: Encode target labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['Sentiment'])

# STEP 6: Train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# STEP 7: Train model
from sklearn.linear_model import LogisticRegression

# max_iter raised from the default 100: high-dimensional sparse TF-IDF
# features commonly trip the lbfgs convergence warning otherwise.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# STEP 8: Predict and evaluate
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))