mirror of
https://github.com/deepseek-ai/DeepSeek-Coder
synced 2025-04-07 05:55:25 +00:00
1105 lines
81 KiB
Plaintext
1105 lines
81 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": [],
|
|
"include_colab_link": true
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "view-in-github",
|
|
"colab_type": "text"
|
|
},
|
|
"source": [
|
|
"<a href=\"https://colab.research.google.com/github/Orrm23/DeepSeek-Coder/blob/main/26_SentimentalAnalysisNLP.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "Mkgh9SoBSOwH"
|
|
},
|
|
"source": [
|
|
"#26 Sentimental Analysis NLP"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "_UNRNEQQG9Sr",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"outputId": "be97a8fd-45a4-4ff2-d743-ad5ff1a29cf0"
|
|
},
|
|
"source": [
|
|
"!pip install nltk"
|
|
],
|
|
"execution_count": 1,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)\n",
|
|
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk) (8.1.8)\n",
|
|
"Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.4.2)\n",
|
|
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk) (2024.11.6)\n",
|
|
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk) (4.67.1)\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "v6-o7BjJTzVx"
|
|
},
|
|
"source": [
|
|
"### Importing Libraries"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "TLZbzW6wT2ZQ"
|
|
},
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import re #Regular expressions\n",
|
|
"import nltk\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"from nltk.corpus import stopwords\n",
|
|
"\n",
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"\n",
|
|
"from sklearn.model_selection import train_test_split"
|
|
],
|
|
"execution_count": 2,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "NkPr6C5_UH9d"
|
|
},
|
|
"source": [
|
|
"### Load Dataset from Local Directory"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "F5rC2iqzUJEm",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 73
|
|
},
|
|
"outputId": "00a19f35-0083-435f-8a27-b21c6b2dd1d7"
|
|
},
|
|
"source": [
|
|
"from google.colab import files\n",
|
|
"uploaded = files.upload()"
|
|
],
|
|
"execution_count": 3,
|
|
"outputs": [
|
|
{
|
|
"output_type": "display_data",
|
|
"data": {
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
],
|
|
"text/html": [
|
|
"\n",
|
|
" <input type=\"file\" id=\"files-27f16a22-dbe6-404a-a90c-1db8d120a56c\" name=\"files[]\" multiple disabled\n",
|
|
" style=\"border:none\" />\n",
|
|
" <output id=\"result-27f16a22-dbe6-404a-a90c-1db8d120a56c\">\n",
|
|
" Upload widget is only available when the cell has been executed in the\n",
|
|
" current browser session. Please rerun this cell to enable.\n",
|
|
" </output>\n",
|
|
" <script>// Copyright 2017 Google LLC\n",
|
|
"//\n",
|
|
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
|
|
"// you may not use this file except in compliance with the License.\n",
|
|
"// You may obtain a copy of the License at\n",
|
|
"//\n",
|
|
"// http://www.apache.org/licenses/LICENSE-2.0\n",
|
|
"//\n",
|
|
"// Unless required by applicable law or agreed to in writing, software\n",
|
|
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
|
|
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
|
|
"// See the License for the specific language governing permissions and\n",
|
|
"// limitations under the License.\n",
|
|
"\n",
|
|
"/**\n",
|
|
" * @fileoverview Helpers for google.colab Python module.\n",
|
|
" */\n",
|
|
"(function(scope) {\n",
|
|
"function span(text, styleAttributes = {}) {\n",
|
|
" const element = document.createElement('span');\n",
|
|
" element.textContent = text;\n",
|
|
" for (const key of Object.keys(styleAttributes)) {\n",
|
|
" element.style[key] = styleAttributes[key];\n",
|
|
" }\n",
|
|
" return element;\n",
|
|
"}\n",
|
|
"\n",
|
|
"// Max number of bytes which will be uploaded at a time.\n",
|
|
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
|
|
"\n",
|
|
"function _uploadFiles(inputId, outputId) {\n",
|
|
" const steps = uploadFilesStep(inputId, outputId);\n",
|
|
" const outputElement = document.getElementById(outputId);\n",
|
|
" // Cache steps on the outputElement to make it available for the next call\n",
|
|
" // to uploadFilesContinue from Python.\n",
|
|
" outputElement.steps = steps;\n",
|
|
"\n",
|
|
" return _uploadFilesContinue(outputId);\n",
|
|
"}\n",
|
|
"\n",
|
|
"// This is roughly an async generator (not supported in the browser yet),\n",
|
|
"// where there are multiple asynchronous steps and the Python side is going\n",
|
|
"// to poll for completion of each step.\n",
|
|
"// This uses a Promise to block the python side on completion of each step,\n",
|
|
"// then passes the result of the previous step as the input to the next step.\n",
|
|
"function _uploadFilesContinue(outputId) {\n",
|
|
" const outputElement = document.getElementById(outputId);\n",
|
|
" const steps = outputElement.steps;\n",
|
|
"\n",
|
|
" const next = steps.next(outputElement.lastPromiseValue);\n",
|
|
" return Promise.resolve(next.value.promise).then((value) => {\n",
|
|
" // Cache the last promise value to make it available to the next\n",
|
|
" // step of the generator.\n",
|
|
" outputElement.lastPromiseValue = value;\n",
|
|
" return next.value.response;\n",
|
|
" });\n",
|
|
"}\n",
|
|
"\n",
|
|
"/**\n",
|
|
" * Generator function which is called between each async step of the upload\n",
|
|
" * process.\n",
|
|
" * @param {string} inputId Element ID of the input file picker element.\n",
|
|
" * @param {string} outputId Element ID of the output display.\n",
|
|
" * @return {!Iterable<!Object>} Iterable of next steps.\n",
|
|
" */\n",
|
|
"function* uploadFilesStep(inputId, outputId) {\n",
|
|
" const inputElement = document.getElementById(inputId);\n",
|
|
" inputElement.disabled = false;\n",
|
|
"\n",
|
|
" const outputElement = document.getElementById(outputId);\n",
|
|
" outputElement.innerHTML = '';\n",
|
|
"\n",
|
|
" const pickedPromise = new Promise((resolve) => {\n",
|
|
" inputElement.addEventListener('change', (e) => {\n",
|
|
" resolve(e.target.files);\n",
|
|
" });\n",
|
|
" });\n",
|
|
"\n",
|
|
" const cancel = document.createElement('button');\n",
|
|
" inputElement.parentElement.appendChild(cancel);\n",
|
|
" cancel.textContent = 'Cancel upload';\n",
|
|
" const cancelPromise = new Promise((resolve) => {\n",
|
|
" cancel.onclick = () => {\n",
|
|
" resolve(null);\n",
|
|
" };\n",
|
|
" });\n",
|
|
"\n",
|
|
" // Wait for the user to pick the files.\n",
|
|
" const files = yield {\n",
|
|
" promise: Promise.race([pickedPromise, cancelPromise]),\n",
|
|
" response: {\n",
|
|
" action: 'starting',\n",
|
|
" }\n",
|
|
" };\n",
|
|
"\n",
|
|
" cancel.remove();\n",
|
|
"\n",
|
|
" // Disable the input element since further picks are not allowed.\n",
|
|
" inputElement.disabled = true;\n",
|
|
"\n",
|
|
" if (!files) {\n",
|
|
" return {\n",
|
|
" response: {\n",
|
|
" action: 'complete',\n",
|
|
" }\n",
|
|
" };\n",
|
|
" }\n",
|
|
"\n",
|
|
" for (const file of files) {\n",
|
|
" const li = document.createElement('li');\n",
|
|
" li.append(span(file.name, {fontWeight: 'bold'}));\n",
|
|
" li.append(span(\n",
|
|
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
|
|
" `last modified: ${\n",
|
|
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
|
|
" 'n/a'} - `));\n",
|
|
" const percent = span('0% done');\n",
|
|
" li.appendChild(percent);\n",
|
|
"\n",
|
|
" outputElement.appendChild(li);\n",
|
|
"\n",
|
|
" const fileDataPromise = new Promise((resolve) => {\n",
|
|
" const reader = new FileReader();\n",
|
|
" reader.onload = (e) => {\n",
|
|
" resolve(e.target.result);\n",
|
|
" };\n",
|
|
" reader.readAsArrayBuffer(file);\n",
|
|
" });\n",
|
|
" // Wait for the data to be ready.\n",
|
|
" let fileData = yield {\n",
|
|
" promise: fileDataPromise,\n",
|
|
" response: {\n",
|
|
" action: 'continue',\n",
|
|
" }\n",
|
|
" };\n",
|
|
"\n",
|
|
" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
|
|
" let position = 0;\n",
|
|
" do {\n",
|
|
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
|
|
" const chunk = new Uint8Array(fileData, position, length);\n",
|
|
" position += length;\n",
|
|
"\n",
|
|
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
|
|
" yield {\n",
|
|
" response: {\n",
|
|
" action: 'append',\n",
|
|
" file: file.name,\n",
|
|
" data: base64,\n",
|
|
" },\n",
|
|
" };\n",
|
|
"\n",
|
|
" let percentDone = fileData.byteLength === 0 ?\n",
|
|
" 100 :\n",
|
|
" Math.round((position / fileData.byteLength) * 100);\n",
|
|
" percent.textContent = `${percentDone}% done`;\n",
|
|
"\n",
|
|
" } while (position < fileData.byteLength);\n",
|
|
" }\n",
|
|
"\n",
|
|
" // All done.\n",
|
|
" yield {\n",
|
|
" response: {\n",
|
|
" action: 'complete',\n",
|
|
" }\n",
|
|
" };\n",
|
|
"}\n",
|
|
"\n",
|
|
"scope.google = scope.google || {};\n",
|
|
"scope.google.colab = scope.google.colab || {};\n",
|
|
"scope.google.colab._files = {\n",
|
|
" _uploadFiles,\n",
|
|
" _uploadFilesContinue,\n",
|
|
"};\n",
|
|
"})(self);\n",
|
|
"</script> "
|
|
]
|
|
},
|
|
"metadata": {}
|
|
},
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"Saving dataset.csv to dataset.csv\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "aK8lU0OUUlX0"
|
|
},
|
|
"source": [
|
|
"### Importing Dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "d2I6egUxUoaq",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"outputId": "ca87b21b-2c27-4661-a0fa-eb682b4a1e60"
|
|
},
|
|
"source": [
|
|
"dataset = pd.read_csv('dataset.csv')\n",
|
|
"print(dataset.shape)\n",
|
|
"print(dataset.head(5))"
|
|
],
|
|
"execution_count": 4,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"(14640, 15)\n",
|
|
" tweet_id airline_sentiment airline_sentiment_confidence \\\n",
|
|
"0 570306133677760513 neutral 1.0000 \n",
|
|
"1 570301130888122368 positive 0.3486 \n",
|
|
"2 570301083672813571 neutral 0.6837 \n",
|
|
"3 570301031407624196 negative 1.0000 \n",
|
|
"4 570300817074462722 negative 1.0000 \n",
|
|
"\n",
|
|
" negativereason negativereason_confidence airline \\\n",
|
|
"0 NaN NaN Virgin America \n",
|
|
"1 NaN 0.0000 Virgin America \n",
|
|
"2 NaN NaN Virgin America \n",
|
|
"3 Bad Flight 0.7033 Virgin America \n",
|
|
"4 Can't Tell 1.0000 Virgin America \n",
|
|
"\n",
|
|
" airline_sentiment_gold name negativereason_gold retweet_count \\\n",
|
|
"0 NaN cairdin NaN 0 \n",
|
|
"1 NaN jnardino NaN 0 \n",
|
|
"2 NaN yvonnalynn NaN 0 \n",
|
|
"3 NaN jnardino NaN 0 \n",
|
|
"4 NaN jnardino NaN 0 \n",
|
|
"\n",
|
|
" text tweet_coord \\\n",
|
|
"0 @VirginAmerica What @dhepburn said. NaN \n",
|
|
"1 @VirginAmerica plus you've added commercials t... NaN \n",
|
|
"2 @VirginAmerica I didn't today... Must mean I n... NaN \n",
|
|
"3 @VirginAmerica it's really aggressive to blast... NaN \n",
|
|
"4 @VirginAmerica and it's a really big bad thing... NaN \n",
|
|
"\n",
|
|
" tweet_created tweet_location user_timezone \n",
|
|
"0 2015-02-24 11:35:52 -0800 NaN Eastern Time (US & Canada) \n",
|
|
"1 2015-02-24 11:15:59 -0800 NaN Pacific Time (US & Canada) \n",
|
|
"2 2015-02-24 11:15:48 -0800 Lets Play Central Time (US & Canada) \n",
|
|
"3 2015-02-24 11:15:36 -0800 NaN Pacific Time (US & Canada) \n",
|
|
"4 2015-02-24 11:14:45 -0800 NaN Pacific Time (US & Canada) \n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "cfM237zDUwSx"
|
|
},
|
|
"source": [
|
|
"###Segregating Dataset into Input & Output"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "ReRaankPU1f0",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"outputId": "8af748d8-bde1-449f-decc-246268f7716d"
|
|
},
|
|
"source": [
|
|
"features = dataset.iloc[:, 10].values\n",
|
|
"labels = dataset.iloc[:, 1].values\n",
|
|
"print(labels)"
|
|
],
|
|
"execution_count": 5,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"['neutral' 'positive' 'neutral' ... 'neutral' 'negative' 'neutral']\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "KQwlfdWsj2LT"
|
|
},
|
|
"source": [
|
|
"###Removing the Special Character"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "uhesmp0CU9xc"
|
|
},
|
|
"source": [
|
|
"processed_features = []\n",
|
|
"\n",
|
|
"for sentence in range(0, len(features)):\n",
|
|
" # Remove all the special characters\n",
|
|
" processed_feature = re.sub(r'\\W', ' ', str(features[sentence]))\n",
|
|
"\n",
|
|
" # remove all single characters\n",
|
|
" processed_feature= re.sub(r'\\s+[a-zA-Z]\\s+', ' ', processed_feature)\n",
|
|
"\n",
|
|
" # Remove single characters from the start\n",
|
|
" processed_feature = re.sub(r'\\^[a-zA-Z]\\s+', ' ', processed_feature)\n",
|
|
"\n",
|
|
" # Substituting multiple spaces with single space\n",
|
|
" processed_feature = re.sub(r'\\s+', ' ', processed_feature, flags=re.I)\n",
|
|
"\n",
|
|
" # Removing prefixed 'b'\n",
|
|
" processed_feature = re.sub(r'^b\\s+', '', processed_feature)\n",
|
|
"\n",
|
|
" # Converting to Lowercase\n",
|
|
" processed_feature = processed_feature.lower()\n",
|
|
"\n",
|
|
" processed_features.append(processed_feature)"
|
|
],
|
|
"execution_count": 6,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "meD0mcOVj5rK"
|
|
},
|
|
"source": [
|
|
"###Feature Extraction from text\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "zzisF0taVA_b",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"outputId": "0bce940a-6261-46ab-c791-4698b6e37807"
|
|
},
|
|
"source": [
|
|
"nltk.download('stopwords')\n",
|
|
"vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))\n",
|
|
"processed_features = vectorizer.fit_transform(processed_features).toarray()\n",
|
|
"print(processed_features)"
|
|
],
|
|
"execution_count": 7,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stderr",
|
|
"text": [
|
|
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
|
|
"[nltk_data] Unzipping corpora/stopwords.zip.\n"
|
|
]
|
|
},
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"[[0. 0. 0. ... 0. 0. 0.]\n",
|
|
" [0. 0. 0. ... 0. 0. 0.]\n",
|
|
" [0. 0. 0. ... 0. 0. 0.]\n",
|
|
" ...\n",
|
|
" [0. 0. 0. ... 0. 0. 0.]\n",
|
|
" [0. 0. 0. ... 0. 0. 0.]\n",
|
|
" [0. 0. 0. ... 0. 0. 0.]]\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "n2vFlF_fj-SK"
|
|
},
|
|
"source": [
|
|
"###Splitting Dataset into Train & Test"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "Q4fpb6RmVI0t"
|
|
},
|
|
"source": [
|
|
"X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)"
|
|
],
|
|
"execution_count": 8,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "z9zzG3lDkC9L"
|
|
},
|
|
"source": [
|
|
"###Loading Random Forest Algorithm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "0TG77VbjVK7H",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 80
|
|
},
|
|
"outputId": "67fca259-b904-426c-dacb-fb233a53040c"
|
|
},
|
|
"source": [
|
|
"text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)\n",
|
|
"text_classifier.fit(X_train, y_train)"
|
|
],
|
|
"execution_count": 9,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/plain": [
|
|
"RandomForestClassifier(n_estimators=200, random_state=0)"
|
|
],
|
|
"text/html": [
|
|
"<style>#sk-container-id-1 {\n",
|
|
" /* Definition of color scheme common for light and dark mode */\n",
|
|
" --sklearn-color-text: #000;\n",
|
|
" --sklearn-color-text-muted: #666;\n",
|
|
" --sklearn-color-line: gray;\n",
|
|
" /* Definition of color scheme for unfitted estimators */\n",
|
|
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
|
|
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
|
|
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
|
|
" --sklearn-color-unfitted-level-3: chocolate;\n",
|
|
" /* Definition of color scheme for fitted estimators */\n",
|
|
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
|
|
" --sklearn-color-fitted-level-1: #d4ebff;\n",
|
|
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
|
|
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
|
|
"\n",
|
|
" /* Specific color for light theme */\n",
|
|
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
|
|
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
|
|
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
|
|
" --sklearn-color-icon: #696969;\n",
|
|
"\n",
|
|
" @media (prefers-color-scheme: dark) {\n",
|
|
" /* Redefinition of color scheme for dark theme */\n",
|
|
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
|
|
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
|
|
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
|
|
" --sklearn-color-icon: #878787;\n",
|
|
" }\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 {\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 pre {\n",
|
|
" padding: 0;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 input.sk-hidden--visually {\n",
|
|
" border: 0;\n",
|
|
" clip: rect(1px 1px 1px 1px);\n",
|
|
" clip: rect(1px, 1px, 1px, 1px);\n",
|
|
" height: 1px;\n",
|
|
" margin: -1px;\n",
|
|
" overflow: hidden;\n",
|
|
" padding: 0;\n",
|
|
" position: absolute;\n",
|
|
" width: 1px;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
|
|
" border: 1px dashed var(--sklearn-color-line);\n",
|
|
" margin: 0 0.4em 0.5em 0.4em;\n",
|
|
" box-sizing: border-box;\n",
|
|
" padding-bottom: 0.4em;\n",
|
|
" background-color: var(--sklearn-color-background);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-container {\n",
|
|
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
|
|
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
|
|
" so we also need the `!important` here to be able to override the\n",
|
|
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
|
|
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
|
|
" display: inline-block !important;\n",
|
|
" position: relative;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
|
|
" display: none;\n",
|
|
"}\n",
|
|
"\n",
|
|
"div.sk-parallel-item,\n",
|
|
"div.sk-serial,\n",
|
|
"div.sk-item {\n",
|
|
" /* draw centered vertical line to link estimators */\n",
|
|
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
|
|
" background-size: 2px 100%;\n",
|
|
" background-repeat: no-repeat;\n",
|
|
" background-position: center center;\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Parallel-specific style estimator block */\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-parallel-item::after {\n",
|
|
" content: \"\";\n",
|
|
" width: 100%;\n",
|
|
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
|
|
" flex-grow: 1;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-parallel {\n",
|
|
" display: flex;\n",
|
|
" align-items: stretch;\n",
|
|
" justify-content: center;\n",
|
|
" background-color: var(--sklearn-color-background);\n",
|
|
" position: relative;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-parallel-item {\n",
|
|
" display: flex;\n",
|
|
" flex-direction: column;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
|
|
" align-self: flex-end;\n",
|
|
" width: 50%;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
|
|
" align-self: flex-start;\n",
|
|
" width: 50%;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
|
|
" width: 0;\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Serial-specific style estimator block */\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-serial {\n",
|
|
" display: flex;\n",
|
|
" flex-direction: column;\n",
|
|
" align-items: center;\n",
|
|
" background-color: var(--sklearn-color-background);\n",
|
|
" padding-right: 1em;\n",
|
|
" padding-left: 1em;\n",
|
|
"}\n",
|
|
"\n",
|
|
"\n",
|
|
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
|
|
"clickable and can be expanded/collapsed.\n",
|
|
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
|
|
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
|
|
"*/\n",
|
|
"\n",
|
|
"/* Pipeline and ColumnTransformer style (default) */\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-toggleable {\n",
|
|
" /* Default theme specific background. It is overwritten whether we have a\n",
|
|
" specific estimator or a Pipeline/ColumnTransformer */\n",
|
|
" background-color: var(--sklearn-color-background);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Toggleable label */\n",
|
|
"#sk-container-id-1 label.sk-toggleable__label {\n",
|
|
" cursor: pointer;\n",
|
|
" display: flex;\n",
|
|
" width: 100%;\n",
|
|
" margin-bottom: 0;\n",
|
|
" padding: 0.5em;\n",
|
|
" box-sizing: border-box;\n",
|
|
" text-align: center;\n",
|
|
" align-items: start;\n",
|
|
" justify-content: space-between;\n",
|
|
" gap: 0.5em;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 label.sk-toggleable__label .caption {\n",
|
|
" font-size: 0.6rem;\n",
|
|
" font-weight: lighter;\n",
|
|
" color: var(--sklearn-color-text-muted);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
|
|
" /* Arrow on the left of the label */\n",
|
|
" content: \"▸\";\n",
|
|
" float: left;\n",
|
|
" margin-right: 0.25em;\n",
|
|
" color: var(--sklearn-color-icon);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Toggleable content - dropdown */\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-toggleable__content {\n",
|
|
" max-height: 0;\n",
|
|
" max-width: 0;\n",
|
|
" overflow: hidden;\n",
|
|
" text-align: left;\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
|
|
" /* fitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
|
|
" margin: 0.2em;\n",
|
|
" border-radius: 0.25em;\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
|
|
" /* Expand drop-down */\n",
|
|
" max-height: 200px;\n",
|
|
" max-width: 100%;\n",
|
|
" overflow: auto;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
|
|
" content: \"▾\";\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Pipeline/ColumnTransformer-specific style */\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
|
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Estimator-specific style */\n",
|
|
"\n",
|
|
"/* Colorize estimator box */\n",
|
|
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
|
" /* fitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
|
|
"#sk-container-id-1 div.sk-label label {\n",
|
|
" /* The background is the default theme color */\n",
|
|
" color: var(--sklearn-color-text-on-default-background);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* On hover, darken the color of the background */\n",
|
|
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Label box, darken color on hover, fitted */\n",
|
|
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Estimator label */\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-label label {\n",
|
|
" font-family: monospace;\n",
|
|
" font-weight: bold;\n",
|
|
" display: inline-block;\n",
|
|
" line-height: 1.2em;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-label-container {\n",
|
|
" text-align: center;\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Estimator-specific */\n",
|
|
"#sk-container-id-1 div.sk-estimator {\n",
|
|
" font-family: monospace;\n",
|
|
" border: 1px dotted var(--sklearn-color-border-box);\n",
|
|
" border-radius: 0.25em;\n",
|
|
" box-sizing: border-box;\n",
|
|
" margin-bottom: 0.5em;\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-estimator.fitted {\n",
|
|
" /* fitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* on hover */\n",
|
|
"#sk-container-id-1 div.sk-estimator:hover {\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
|
|
" /* fitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
|
|
"\n",
|
|
"/* Common style for \"i\" and \"?\" */\n",
|
|
"\n",
|
|
".sk-estimator-doc-link,\n",
|
|
"a:link.sk-estimator-doc-link,\n",
|
|
"a:visited.sk-estimator-doc-link {\n",
|
|
" float: right;\n",
|
|
" font-size: smaller;\n",
|
|
" line-height: 1em;\n",
|
|
" font-family: monospace;\n",
|
|
" background-color: var(--sklearn-color-background);\n",
|
|
" border-radius: 1em;\n",
|
|
" height: 1em;\n",
|
|
" width: 1em;\n",
|
|
" text-decoration: none !important;\n",
|
|
" margin-left: 0.5em;\n",
|
|
" text-align: center;\n",
|
|
" /* unfitted */\n",
|
|
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
|
|
" color: var(--sklearn-color-unfitted-level-1);\n",
|
|
"}\n",
|
|
"\n",
|
|
".sk-estimator-doc-link.fitted,\n",
|
|
"a:link.sk-estimator-doc-link.fitted,\n",
|
|
"a:visited.sk-estimator-doc-link.fitted {\n",
|
|
" /* fitted */\n",
|
|
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
|
|
" color: var(--sklearn-color-fitted-level-1);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* On hover */\n",
|
|
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
|
|
".sk-estimator-doc-link:hover,\n",
|
|
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
|
|
".sk-estimator-doc-link:hover {\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-3);\n",
|
|
" color: var(--sklearn-color-background);\n",
|
|
" text-decoration: none;\n",
|
|
"}\n",
|
|
"\n",
|
|
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
|
|
".sk-estimator-doc-link.fitted:hover,\n",
|
|
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
|
|
".sk-estimator-doc-link.fitted:hover {\n",
|
|
" /* fitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-3);\n",
|
|
" color: var(--sklearn-color-background);\n",
|
|
" text-decoration: none;\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* Span, style for the box shown on hovering the info icon */\n",
|
|
".sk-estimator-doc-link span {\n",
|
|
" display: none;\n",
|
|
" z-index: 9999;\n",
|
|
" position: relative;\n",
|
|
" font-weight: normal;\n",
|
|
" right: .2ex;\n",
|
|
" padding: .5ex;\n",
|
|
" margin: .5ex;\n",
|
|
" width: min-content;\n",
|
|
" min-width: 20ex;\n",
|
|
" max-width: 50ex;\n",
|
|
" color: var(--sklearn-color-text);\n",
|
|
" box-shadow: 2pt 2pt 4pt #999;\n",
|
|
" /* unfitted */\n",
|
|
" background: var(--sklearn-color-unfitted-level-0);\n",
|
|
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
|
|
"}\n",
|
|
"\n",
|
|
".sk-estimator-doc-link.fitted span {\n",
|
|
" /* fitted */\n",
|
|
" background: var(--sklearn-color-fitted-level-0);\n",
|
|
" border: var(--sklearn-color-fitted-level-3);\n",
|
|
"}\n",
|
|
"\n",
|
|
".sk-estimator-doc-link:hover span {\n",
|
|
" display: block;\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
|
|
"\n",
|
|
"#sk-container-id-1 a.estimator_doc_link {\n",
|
|
" float: right;\n",
|
|
" font-size: 1rem;\n",
|
|
" line-height: 1em;\n",
|
|
" font-family: monospace;\n",
|
|
" background-color: var(--sklearn-color-background);\n",
|
|
" border-radius: 1rem;\n",
|
|
" height: 1rem;\n",
|
|
" width: 1rem;\n",
|
|
" text-decoration: none;\n",
|
|
" /* unfitted */\n",
|
|
" color: var(--sklearn-color-unfitted-level-1);\n",
|
|
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
|
|
" /* fitted */\n",
|
|
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
|
|
" color: var(--sklearn-color-fitted-level-1);\n",
|
|
"}\n",
|
|
"\n",
|
|
"/* On hover */\n",
|
|
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
|
|
" /* unfitted */\n",
|
|
" background-color: var(--sklearn-color-unfitted-level-3);\n",
|
|
" color: var(--sklearn-color-background);\n",
|
|
" text-decoration: none;\n",
|
|
"}\n",
|
|
"\n",
|
|
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
|
|
" /* fitted */\n",
|
|
" background-color: var(--sklearn-color-fitted-level-3);\n",
|
|
"}\n",
|
|
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(n_estimators=200, random_state=0)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>RandomForestClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.6/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(n_estimators=200, random_state=0)</pre></div> </div></div></div></div>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"execution_count": 9
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "s-cE4paSkH69"
|
|
},
|
|
"source": [
|
|
"### Predicting the Test Data with the Trained Model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "JjNeO6rQVMfr"
|
|
},
|
|
"source": [
|
|
"predictions = text_classifier.predict(X_test)"
|
|
],
|
|
"execution_count": 10,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "WzNioePqkMnH"
|
|
},
|
|
"source": [
|
|
"### Score of the Model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "o2TY6JdyVOXn",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"outputId": "8f600ab5-7d7e-453c-c57e-4d9db5a1fb86"
|
|
},
|
|
"source": [
|
|
"print(accuracy_score(y_test, predictions))"
|
|
],
|
|
"execution_count": 11,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"0.7592213114754098\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "MGL1jroOkRaR"
|
|
},
|
|
"source": [
|
|
"### Confusion Matrix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "Skyz1_qpVQgl",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 506
|
|
},
|
|
"outputId": "b78c2ba2-5d35-4535-f99c-d6ee7f891fd1"
|
|
},
|
|
"source": [
|
|
"from sklearn import metrics\n",
|
|
"import itertools\n",
|
|
"def plot_confusion_matrix(cm, classes,\n",
|
|
" normalize=False,\n",
|
|
" title='Confusion matrix',\n",
|
|
" cmap=plt.cm.Blues):\n",
|
|
"\n",
|
|
" plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
|
|
" plt.title(title)\n",
|
|
" plt.colorbar()\n",
|
|
" tick_marks = np.arange(len(classes))\n",
|
|
" plt.xticks(tick_marks, classes)\n",
|
|
" plt.yticks(tick_marks, classes)\n",
|
|
"\n",
|
|
" thresh = cm.max() / 2.\n",
|
|
" for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
|
|
" plt.text(j, i, cm[i, j],\n",
|
|
" horizontalalignment=\"center\",\n",
|
|
" color=\"white\" if cm[i, j] > thresh else \"black\")\n",
|
|
"\n",
|
|
" plt.tight_layout()\n",
|
|
" plt.ylabel('True label')\n",
|
|
" plt.xlabel('Predicted label')\n",
|
|
"\n",
|
|
"cm = metrics.confusion_matrix(y_test, predictions, labels=['negative', 'neutral', 'positive'])\n",
|
|
"plot_confusion_matrix(cm, classes=['negative', 'neutral', 'positive'])"
|
|
],
|
|
"execution_count": 12,
|
|
"outputs": [
|
|
{
|
|
"output_type": "display_data",
|
|
"data": {
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 2 Axes>"
|
|
],
|
|
"image/png": "\n"
|
|
},
|
|
"metadata": {}
|
|
}
|
|
]
|
|
}
|
|
]
|
|
} |