Stagergy 1
Tue 01 July 2025
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
df = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Confusion Matrix:
[[965 0]
[ 37 113]]
Classification Report:
precision recall f1-score support
0 0.96 1.00 0.98 965
1 1.00 0.75 0.86 150
accuracy 0.97 1115
macro avg 0.98 0.88 0.92 1115
weighted avg 0.97 0.97 0.96 1115
sample = ["Congratulations! You won a $1000 prize. Click here to claim."]
sample_tfidf = vectorizer.transform(sample)
print("\nPrediction:", "Spam" if model.predict(sample_tfidf)[0] == 1 else "Ham")
Prediction: Spam
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
categories = ['rec.sport.baseball', 'sci.space', 'comp.graphics', 'talk.politics.mideast']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
model = LinearSVC()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=categories))
Confusion Matrix:
[[199 13 8 0]
[ 3 169 3 3]
[ 6 12 165 6]
[ 3 9 1 179]]
Classification Report:
precision recall f1-score support
rec.sport.baseball 0.94 0.90 0.92 220
sci.space 0.83 0.95 0.89 178
comp.graphics 0.93 0.87 0.90 189
talk.politics.mideast 0.95 0.93 0.94 192
accuracy 0.91 779
macro avg 0.91 0.91 0.91 779
weighted avg 0.92 0.91 0.91 779
sample = ["NASA successfully launched a new satellite into orbit last night."]
sample_tfidf = vectorizer.transform(sample)
predicted_category = model.predict(sample_tfidf)[0]
print("\nPredicted Category:", categories[predicted_category])
Predicted Category: comp.graphics
import pandas as pd
import numpy as np
import requests
from io import BytesIO
import re
from bs4 import BeautifulSoup
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
nltk.download('stopwords')
from nltk.corpus import stopwords
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\stefi\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
Score: 10
Category: basics