Test4

Tue 01 July 2025
import pandas as pd
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
df = pd.read_csv('IMDB Dataset.csv',
    quoting=csv.QUOTE_MINIMAL,
    on_bad_lines='skip',
    encoding='utf-8')
df.head()
review sentiment
0 One of the other reviewers has mentioned that ... positive
1 A wonderful little production. <br /><br />The... positive
2 I thought this was a wonderful way to spend ti... positive
3 Basically there's a family where a little boy ... negative
4 Petter Mattei's "Love in the Time of Money" is... positive
df.isnull().sum()
review       0
sentiment    0
dtype: int64
df['sentiment'].value_counts().plot(kind='bar', color=['green', 'red'], title='Sentiment Distribution')
plt.show()

png

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

df.head()
review sentiment
0 One of the other reviewers has mentioned that ... 1
1 A wonderful little production. <br /><br />The... 1
2 I thought this was a wonderful way to spend ti... 1
3 Basically there's a family where a little boy ... 0
4 Petter Mattei's "Love in the Time of Money" is... 1
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_train_tfidf.shape, X_test_tfidf.shape
((40000, 10000), (10000, 10000))
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
y_pred = model.predict(X_test_tfidf)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Negative","Positive"]))
Confusion Matrix:
[[4330  631]
 [ 459 4580]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.87      0.89      4961
    Positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Score: 10

Category: basics