6 Commits

Author   SHA1         Message                                                               Date
Towfiq   0c8b457eee   chore(release): 0.2.2                                                 2022-12-25 15:59:12 +06:00
Towfiq   123ad81dae   fix: Fixes bug that prevents Saving API settings (fixes: issue #45)  2022-12-25 15:58:44 +06:00
Towfiq   6c48b87e5e   chore(release): 0.2.1                                                 2022-12-24 20:28:07 +06:00
Towfiq   cf8b2c6913   refactor: Scraper                                                     2022-12-24 20:27:55 +06:00
Towfiq   6d6e2f63d0   fixes: Broken Proxy Scraper (fixes #33)                               2022-12-24 08:29:44 +06:00
Towfiq   74c9603293   fixes: Wrong position change indicator when >100                      2022-12-24 08:28:09 +06:00
15 changed files with 247 additions and 126 deletions

CHANGELOG.md

@@ -2,6 +2,15 @@
 All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
+### [0.2.2](https://github.com/towfiqi/serpbear/compare/v0.2.1...v0.2.2) (2022-12-25)
+### Bug Fixes
+* Fixes bug that prevents Saving API settings ([123ad81](https://github.com/towfiqi/serpbear/commit/123ad81dae10aa28848148d0f3da5cf1f7de7c57)), closes [#45](https://github.com/towfiqi/serpbear/issues/45)
 ### [0.2.1](https://github.com/towfiqi/serpbear/compare/v0.2.0...v0.2.1) (2022-12-24)
 ## [0.2.0](https://github.com/towfiqi/serpbear/compare/v0.1.7...v0.2.0) (2022-12-21)


@@ -56,6 +56,9 @@ const Keyword = (props: KeywordProps) => {
       const historySorted = historyArray.sort((a, b) => a.date - b.date);
       const previousPos = historySorted[historySorted.length - 2].position;
       status = previousPos === 0 ? position : previousPos - position;
+      if (position === 0 && previousPos > 0) {
+         status = previousPos - 100;
+      }
    }
    return status;
 }, [history, position]);
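Note: this is the fix for "Wrong position change indicator when >100" (74c9603293). The scrapers only fetch the top 100 results, so a keyword that drops off the SERP comes back with position 0, and the old `previousPos - position` arithmetic turned that drop into a large positive (upward) change. A standalone sketch of the corrected computation, with illustrative inputs:

// Sketch of the fixed status logic, isolated from the component (names illustrative).
const getStatus = (previousPos: number, position: number): number => {
   // previousPos === 0 means previously unranked: report the new position as the change.
   let status = previousPos === 0 ? position : previousPos - position;
   // position === 0 means the keyword fell out of the scraped top 100:
   // measure the drop against the 100-result cutoff instead of against 0.
   if (position === 0 && previousPos > 0) {
      status = previousPos - 100;
   }
   return status;
};

getStatus(8, 3); // => 5 (moved up five spots)
getStatus(8, 0); // => -92 (fell off the top 100; the old code reported +8)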


@@ -63,21 +63,20 @@ const Settings = ({ closeSettings }:SettingsProps) => {
    const performUpdate = () => {
       let error: null|SettingsError = null;
-      if (settings.notification_interval !== 'never') {
+      const { notification_interval, notification_email, notification_email_from, scraper_type, smtp_port, smtp_server, scaping_api } = settings;
+      if (notification_interval !== 'never') {
          if (!settings.notification_email) {
             error = { type: 'no_email', msg: 'Insert a Valid Email address' };
          }
-         if (settings.notification_email
-            && (!settings.smtp_port || !settings.smtp_server
-            || !settings.notification_email_from)) {
+         if (notification_email && (!smtp_port || !smtp_server || !notification_email_from)) {
            let type = 'no_smtp_from';
-            if (!settings.smtp_port) { type = 'no_smtp_port'; }
-            if (!settings.smtp_server) { type = 'no_smtp_server'; }
+            if (!smtp_port) { type = 'no_smtp_port'; }
+            if (!smtp_server) { type = 'no_smtp_server'; }
            error = { type, msg: 'Insert SMTP Server details that will be used to send the emails.' };
         }
      }
-      if (['scrapingant', 'scrapingrobot', 'serply', 'serpapi'].includes(settings.scraper_type) && !settings.scaping_api) {
+      if (scraper_type !== 'proxy' && scraper_type !== 'none' && !scaping_api) {
         error = { type: 'no_api_key', msg: 'Insert a Valid API Key or Token for the Scraper Service.' };
      }
@@ -98,14 +97,8 @@ const Settings = ({ closeSettings }:SettingsProps) => {
       { label: 'Monthly', value: 'monthly' },
       { label: 'Never', value: 'never' },
    ];
-   const scraperOptions: SelectionOption[] = [
-      { label: 'None', value: 'none' },
-      { label: 'Proxy', value: 'proxy' },
-      { label: 'ScrapingAnt.com', value: 'scrapingant' },
-      { label: 'ScrapingRobot.com', value: 'scrapingrobot' },
-      { label: 'serply.io', value: 'serply' },
-      { label: 'serpapi.com', value: 'serpapi' },
-   ];
+   const allScrapers: SelectionOption[] = settings.available_scapers ? settings.available_scapers : [];
+   const scraperOptions: SelectionOption[] = [{ label: 'None', value: 'none' }, ...allScrapers];
    const tabStyle = 'inline-block px-4 py-1 rounded-full mr-3 cursor-pointer text-sm';
    return (
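With this change the scraper dropdown is no longer hard-coded: options come from the `available_scapers` list that the settings API now derives from the scraper registry (see scrapers/index.ts below), so a newly added service appears in the UI automatically. A small sketch of the derivation, assuming the `SelectionOption` shape used above:

// Assumed shape, matching its use in this diff.
type SelectionOption = { label: string, value: string };

// 'None' is always offered; every other option comes from the registry via the API.
const buildScraperOptions = (available?: SelectionOption[]): SelectionOption[] =>
   [{ label: 'None', value: 'none' }, ...(available || [])];

buildScraperOptions([{ label: 'Proxy', value: 'proxy' }]);
// => [{ label: 'None', value: 'none' }, { label: 'Proxy', value: 'proxy' }]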

package-lock.json generated (4 changed lines)

@@ -1,12 +1,12 @@
 {
    "name": "serpbear",
-   "version": "0.2.0",
+   "version": "0.2.2",
    "lockfileVersion": 2,
    "requires": true,
    "packages": {
       "": {
          "name": "serpbear",
-         "version": "0.2.0",
+         "version": "0.2.2",
          "dependencies": {
             "@googleapis/searchconsole": "^1.0.0",
             "@testing-library/react": "^13.4.0",

package.json

@@ -1,6 +1,6 @@
 {
    "name": "serpbear",
-   "version": "0.2.0",
+   "version": "0.2.2",
    "private": true,
    "scripts": {
       "dev": "next dev",

pages/api/settings.ts

@@ -2,6 +2,7 @@ import type { NextApiRequest, NextApiResponse } from 'next';
 import Cryptr from 'cryptr';
 import { writeFile, readFile } from 'fs/promises';
 import verifyUser from '../../utils/verifyUser';
+import allScrapers from '../../scrapers/index';

 type SettingsGetResponse = {
    settings?: object | null,
@@ -65,6 +66,7 @@ export const getAppSettings = async () : Promise<SettingsType> => {
          scaping_api,
          smtp_password,
          search_console_integrated: !!(process.env.SEARCH_CONSOLE_PRIVATE_KEY && process.env.SEARCH_CONSOLE_CLIENT_EMAIL),
+         available_scapers: allScrapers.map((scraper) => ({ label: scraper.name, value: scraper.id })),
       };
    } catch (error) {
       console.log('Error Decrypting Settings API Keys!');

scrapers/index.ts Normal file (13 added lines)

@@ -0,0 +1,13 @@
import scrapingAnt from './services/scrapingant';
import scrapingRobot from './services/scrapingrobot';
import serpapi from './services/serpapi';
import serply from './services/serply';
import proxy from './services/proxy';
export default [
scrapingRobot,
scrapingAnt,
serpapi,
serply,
proxy,
];
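This registry is the core of the `refactor: Scraper` commit (cf8b2c6913): each service is a self-contained `ScraperSettings` object, and callers resolve the active one by `id` instead of branching on `scraper_type`. A minimal sketch of the lookup, mirroring how utils/scraper.ts uses it later in this diff:

import allScrapers from './scrapers/index';

// settings.scraper_type holds the id of the configured service.
const scraperType = 'serpapi'; // illustrative value; normally read from settings
const scraperObj = allScrapers.find((scraper) => scraper.id === scraperType);
console.log(scraperObj?.name); // => 'SerpApi.com'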

scrapers/services/proxy.ts Normal file

@@ -0,0 +1,35 @@
import cheerio from 'cheerio';
const proxy:ScraperSettings = {
id: 'proxy',
name: 'Proxy',
website: '',
resultObjectKey: 'data',
headers: () => {
return { Accept: 'gzip,deflate,compress;' };
},
scrapeURL: (keyword) => {
return `https://www.google.com/search?num=100&q=${encodeURI(keyword.keyword)}`;
},
serpExtractor: (content) => {
const extractedResult = [];
const $ = cheerio.load(content);
let lastPosition = 0;
const mainContent = $('body').find('#main');
const children = $(mainContent).find('h3');
for (let index = 0; index < children.length; index += 1) {
const title = $(children[index]).text();
const url = $(children[index]).closest('a').attr('href');
const cleanedURL = url ? url.replaceAll(/^.+?(?=https:|$)/g, '').replaceAll(/(&).*/g, '') : '';
if (title && url) {
lastPosition += 1;
extractedResult.push({ title, url: cleanedURL, position: lastPosition });
}
}
return extractedResult;
},
};
export default proxy;
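The two `replaceAll` calls normalize the redirect-style hrefs Google serves to plain HTTP clients, which look like `/url?q=<target>&sa=...`: the first strips everything before the `https:` of the real target, the second cuts the tracking parameters that follow. A quick worked example on a made-up href of that shape:

// Hypothetical href as it appears in a scraped Google result page.
const href = '/url?q=https://example.com/page&sa=U&ved=2ahUKE';
const cleanedURL = href
   .replaceAll(/^.+?(?=https:|$)/g, '') // drop the '/url?q=' redirect prefix
   .replaceAll(/(&).*/g, '');           // drop '&sa=...' and everything after it
console.log(cleanedURL); // => 'https://example.com/page'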

scrapers/services/scrapingant.ts Normal file

@@ -0,0 +1,14 @@
const scrapingAnt:ScraperSettings = {
id: 'scrapingant',
name: 'ScrapingAnt',
website: 'scrapingant.com',
scrapeURL: (keyword, settings, countryData) => {
const scraperCountries = ['AE', 'BR', 'CN', 'DE', 'ES', 'FR', 'GB', 'HK', 'PL', 'IN', 'IT', 'IL', 'JP', 'NL', 'RU', 'SA', 'US', 'CZ'];
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
const lang = countryData[country][2];
return `https://api.scrapingant.com/v2/extended?url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}&x-api-key=${settings.scaping_api}&proxy_country=${country}&browser=false`;
},
resultObjectKey: 'result',
};
export default scrapingAnt;

scrapers/services/scrapingrobot.ts Normal file

@@ -0,0 +1,13 @@
const scrapingRobot:ScraperSettings = {
id: 'scrapingrobot',
name: 'Scraping Robot',
website: 'scrapingrobot.com',
scrapeURL: (keyword, settings, countryData) => {
const country = keyword.country || 'US';
const lang = countryData[country][2];
return `https://api.scrapingrobot.com/?token=${settings.scaping_api}&proxyCountry=${country}&render=false${keyword.device === 'mobile' ? '&mobile=true' : ''}&url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}`;
},
resultObjectKey: 'result',
};
export default scrapingRobot;

scrapers/services/serpapi.ts Normal file

@@ -0,0 +1,38 @@
interface SerpApiResult {
title: string,
link: string,
position: number,
}
const serpapi:ScraperSettings = {
id: 'serpapi',
name: 'SerpApi.com',
website: 'serpapi.com',
headers: (keyword, settings) => {
return {
'Content-Type': 'application/json',
'X-API-Key': settings.scaping_api,
};
},
scrapeURL: (keyword, settings) => {
return `https://serpapi.com/search?q=${encodeURI(keyword.keyword)}&num=100&gl=${keyword.country}&device=${keyword.device}&api_key=${settings.scaping_api}`;
},
resultObjectKey: 'organic_results',
serpExtractor: (content) => {
const extractedResult = [];
const results: SerpApiResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerpApiResult[];
for (const { link, title, position } of results) {
if (title && link) {
extractedResult.push({
title,
url: link,
position,
});
}
}
return extractedResult;
},
};
export default serpapi;

scrapers/services/serply.ts Normal file

@@ -0,0 +1,42 @@
interface SerplyResult {
title: string,
link: string,
realPosition: number,
}
const scraperCountries = ['US', 'CA', 'IE', 'GB', 'FR', 'DE', 'SE', 'IN', 'JP', 'KR', 'SG', 'AU', 'BR'];
const serply:ScraperSettings = {
id: 'serply',
name: 'Serply',
website: 'serply.io',
headers: (keyword, settings) => {
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
return {
'Content-Type': 'application/json',
'X-User-Agent': keyword.device === 'mobile' ? 'mobile' : 'desktop',
'X-Api-Key': settings.scaping_api,
'X-Proxy-Location': country,
};
},
scrapeURL: (keyword) => {
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
return `https://api.serply.io/v1/search/q=${encodeURI(keyword.keyword)}&num=100&hl=${country}`;
},
resultObjectKey: 'result',
serpExtractor: (content) => {
const extractedResult = [];
const results: SerplyResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerplyResult[];
for (const result of results) {
if (result.title && result.link) {
extractedResult.push({
title: result.title,
url: result.link,
position: result.realPosition,
});
}
}
return extractedResult;
},
};
export default serply;

types.d.ts vendored (18 changed lines)

@@ -54,7 +54,7 @@ type KeywordFilters = {
 }
 type countryData = {
-   [ISO:string] : string[]
+   [ISO:string] : [countryName:string, cityName:string, language:string]
 }
 type countryCodeData = {
@@ -78,6 +78,7 @@ type SettingsType = {
    smtp_username?: string,
    smtp_password?: string,
    search_console_integrated?: boolean,
+   available_scapers?: Array
 }
 type KeywordSCDataChild = {
@@ -163,3 +164,18 @@ type SCDomainDataType = {
 }
 type SCKeywordType = SearchAnalyticsItem;
+type scraperExtractedItem = {
+   title: string,
+   url: string,
+   position: number,
+}
+interface ScraperSettings {
+   id:string,
+   name:string,
+   website:string,
+   resultObjectKey: string,
+   headers?(keyword:KeywordType, settings: SettingsType): Object,
+   scrapeURL?(keyword:KeywordType, settings:SettingsType, countries:countryData): string,
+   serpExtractor?(content:string): scraperExtractedItem[],
+}
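`ScraperSettings` is the extension point the refactor introduces: a service needs an `id` (matched against `settings.scraper_type`), a display `name`, the response key its results live under, plus optional headers, a URL builder, and a custom result extractor. A hedged sketch of what an additional service file might look like; the service name, endpoint, and response key are hypothetical:

// scrapers/services/example.ts (hypothetical service, for illustration only)
const example: ScraperSettings = {
   id: 'example',              // matched against settings.scraper_type
   name: 'Example.com',        // label shown in the Settings dropdown
   website: 'example.com',
   resultObjectKey: 'results', // key the service nests its SERP data under
   headers: (keyword, settings) => ({
      'Content-Type': 'application/json',
      'X-Api-Key': settings.scaping_api, // note: the codebase spells it 'scaping_api'
   }),
   scrapeURL: (keyword, settings, countries) => {
      const lang = countries[keyword.country][2]; // third tuple slot is the language
      return `https://api.example.com/search?q=${encodeURI(keyword.keyword)}&hl=${lang}`;
   },
};
export default example;

Registering it would then be a one-line addition to the array exported from scrapers/index.ts.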


@@ -60,7 +60,10 @@ const getPositionChange = (history:KeywordHistory, position:number) : number =>
    }));
    const historySorted = historyArray.sort((a, b) => a.date - b.date);
    const previousPos = historySorted[historySorted.length - 2].position;
-   status = previousPos - position;
+   status = previousPos === 0 ? position : previousPos - position;
+   if (position === 0 && previousPos > 0) {
+      status = previousPos - 100;
+   }
 }
 return status;
};

utils/scraper.ts

@@ -1,10 +1,9 @@
 import axios, { AxiosResponse, CreateAxiosDefaults } from 'axios';
-// import axiosRetry from 'axios-retry';
-// import path from 'path';
 import cheerio from 'cheerio';
 import { readFile, writeFile } from 'fs/promises';
 import HttpsProxyAgent from 'https-proxy-agent';
 import countries from './countries';
+import allScrapers from '../scrapers/index';

 type SearchResult = {
    title: string,
@@ -26,25 +25,13 @@ export type RefreshResult = false | {
    error?: boolean | string
 }
-interface SerplyResult {
-   title: string,
-   link: string,
-   realPosition: number,
-}
-interface SerpApiResult {
-   title: string,
-   link: string,
-   position: number,
-}
 /**
  * Creates a SERP Scraper client promise based on the app settings.
  * @param {KeywordType} keyword - the keyword to get the SERP for.
  * @param {SettingsType} settings - the App Settings that contains the scraper details
  * @returns {Promise}
  */
-export const getScraperClient = (keyword:KeywordType, settings:SettingsType): Promise<AxiosResponse|Response> | false => {
+export const getScraperClient = (keyword:KeywordType, settings:SettingsType, scraper?: ScraperSettings): Promise<AxiosResponse|Response> | false => {
    let apiURL = ''; let client: Promise<AxiosResponse|Response> | false = false;
    const headers: any = {
       'Content-Type': 'application/json',
@@ -58,40 +45,27 @@ export const getScraperClient = (keyword:KeywordType, settings:SettingsType): Pr
       headers['User-Agent'] = mobileAgent;
    }
-   if (settings && settings.scraper_type === 'scrapingant' && settings.scaping_api) {
-      const scraperCountries = ['AE', 'BR', 'CN', 'DE', 'ES', 'FR', 'GB', 'HK', 'PL', 'IN', 'IT', 'IL', 'JP', 'NL', 'RU', 'SA', 'US', 'CZ'];
-      const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
-      const lang = countries[country][2];
-      apiURL = `https://api.scrapingant.com/v2/extended?url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}&x-api-key=${settings.scaping_api}&proxy_country=${country}&browser=false`;
-   }
-   if (settings && settings.scraper_type === 'scrapingrobot' && settings.scaping_api) {
-      const country = keyword.country || 'US';
-      const lang = countries[country][2];
-      apiURL = `https://api.scrapingrobot.com/?token=${settings.scaping_api}&proxyCountry=${country}&render=false${keyword.device === 'mobile' ? '&mobile=true' : ''}&url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}`;
-   }
-   // Serply.io docs https://docs.serply.io/api
-   if (settings && settings.scraper_type === 'serply' && settings.scaping_api) {
-      const scraperCountries = ['US', 'CA', 'IE', 'GB', 'FR', 'DE', 'SE', 'IN', 'JP', 'KR', 'SG', 'AU', 'BR'];
-      const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
-      if (keyword.device === 'mobile') {
-         headers['X-User-Agent'] = 'mobile';
-      } else {
-         headers['X-User-Agent'] = 'desktop';
+   if (scraper) {
+      // Set Scraper Header
+      const scrapeHeaders = scraper.headers ? scraper.headers(keyword, settings) : null;
+      const scraperAPIURL = scraper.scrapeURL ? scraper.scrapeURL(keyword, settings, countries) : null;
+      if (scrapeHeaders) {
+         Object.keys(scrapeHeaders).forEach((headerItemKey:string) => {
+            headers[headerItemKey] = scrapeHeaders[headerItemKey as keyof object];
+         });
+      }
+      // Set Scraper API URL
+      // If not URL is generated, stop right here.
+      if (scraperAPIURL) {
+         apiURL = scraperAPIURL;
+      } else {
+         return false;
       }
-      headers['X-Proxy-Location'] = country;
-      headers['X-Api-Key'] = settings.scaping_api;
-      apiURL = `https://api.serply.io/v1/search/q=${encodeURI(keyword.keyword)}&num=100&hl=${country}`;
    }
-   // SerpApi docs: https://serpapi.com
-   if (settings && settings.scraper_type === 'serpapi' && settings.scaping_api) {
-      apiURL = `https://serpapi.com/search?q=${encodeURI(keyword.keyword)}&num=100&gl=${keyword.country}&device=${keyword.device}&api_key=${settings.scaping_api}`;
-   }
    if (settings && settings.scraper_type === 'proxy' && settings.proxy) {
       const axiosConfig: CreateAxiosDefaults = {};
-      headers.Accept = 'gzip,deflate,compress;';
      axiosConfig.headers = headers;
      const proxies = settings.proxy.split(/\r?\n|\r|\n/g);
      let proxyURL = '';
@@ -128,30 +102,37 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett
       result: keyword.lastResult,
       error: true,
    };
-   const scraperClient = getScraperClient(keyword, settings);
+   const scraperType = settings?.scraper_type || '';
+   const scraperObj = allScrapers.find((scraper:ScraperSettings) => scraper.id === scraperType);
+   const scraperClient = getScraperClient(keyword, settings, scraperObj);
    if (!scraperClient) { return false; }
-   let res:any = null; let scraperError:any = null;
-   try {
-      if (settings && settings.scraper_type === 'proxy' && settings.proxy) {
-         res = await scraperClient;
-      } else {
-         res = await scraperClient.then((result:any) => result.json());
-      }
-      if (res && (res.data || res.html || res.result || res.results || res.organic_results)) {
-         const extracted = extractScrapedResult(res.data || res.html || res.result || res.results || res.organic_results, settings.scraper_type);
-         // await writeFile('result.txt', JSON.stringify(extracted), { encoding: 'utf-8' }).catch((err) => { console.log(err); });
+   let scraperError:any = null;
+   try {
+      const res = scraperType === 'proxy' && settings.proxy ? await scraperClient : await scraperClient.then((reslt:any) => reslt.json());
+      const scraperResult = scraperObj?.resultObjectKey && res[scraperObj.resultObjectKey] ? res[scraperObj.resultObjectKey] : '';
+      const scrapeResult:string = (res.data || res.html || res.results || scraperResult || '');
+      if (res && scrapeResult) {
+         const extracted = scraperObj?.serpExtractor ? scraperObj.serpExtractor(scrapeResult) : extractScrapedResult(scrapeResult);
+         // await writeFile('result.txt', JSON.stringify(scrapeResult), { encoding: 'utf-8' }).catch((err) => { console.log(err); });
         const serp = getSerp(keyword.domain, extracted);
         refreshedResults = { ID: keyword.ID, keyword: keyword.keyword, position: serp.postion, url: serp.url, result: extracted, error: false };
-         console.log('SERP: ', keyword.keyword, serp.postion, serp.url);
+         console.log('[SERP]: ', keyword.keyword, serp.postion, serp.url);
      } else {
         scraperError = res.detail || res.error || 'Unknown Error';
+         throw new Error(res);
      }
   } catch (error:any) {
-      console.log('#### SCRAPE ERROR: ', keyword.keyword, '. Error: ', scraperError);
      refreshedResults.error = scraperError;
      if (settings.scraper_type === 'proxy' && error && error.response && error.response.statusText) {
         refreshedResults.error = `[${error.response.status}] ${error.response.statusText}`;
      }
+      console.log('[ERROR] Scraping Keyword : ', keyword.keyword, '. Error: ', error && error.response && error.response.statusText);
+      if (!(error && error.response && error.response.statusText)) {
+         console.log('[ERROR_MESSAGE]: ', error);
+      }
   }
   return refreshedResults;
@@ -160,10 +141,9 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett
 /**
  * Extracts the Google Search result as object array from the Google Search's HTML content
  * @param {string} content - scraped google search page html data.
- * @param {string} scraper_type - the type of scraper (Proxy or Scraper)
  * @returns {SearchResult[]}
  */
-export const extractScrapedResult = (content: string, scraper_type:string): SearchResult[] => {
+export const extractScrapedResult = (content: string): SearchResult[] => {
    const extractedResult = [];
    const $ = cheerio.load(content);
@@ -171,57 +151,17 @@ export const extractScrapedResult = (content: string, scraper_type:string): Sear
    const searchResult = hasNumberofResult.children();
    let lastPosition = 0;
-   if (scraper_type === 'proxy') {
-      const mainContent = $('body').find('#main');
-      const children = $(mainContent).find('h3');
-      for (let index = 0; index < children.length; index += 1) {
-         const title = $(children[index]).text();
-         const url = $(children[index]).closest('a').attr('href');
-         const cleanedURL = url ? url.replace('/url?q=', '').replace(/&sa=.*/, '') : '';
+   for (let i = 0; i < searchResult.length; i += 1) {
+      if (searchResult[i]) {
+         const title = $(searchResult[i]).find('h3').html();
+         const url = $(searchResult[i]).find('a').attr('href');
+         // console.log(i, url?.slice(0, 40), title?.slice(0, 40));
         if (title && url) {
            lastPosition += 1;
-            extractedResult.push({ title, url: cleanedURL, position: lastPosition });
+            extractedResult.push({ title, url, position: lastPosition });
         }
      }
-   } else if (scraper_type === 'serply') {
-      // results already in json
-      const results: SerplyResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerplyResult[];
-      for (const result of results) {
-         if (result.title && result.link) {
-            extractedResult.push({
-               title: result.title,
-               url: result.link,
-               position: result.realPosition,
-            });
-         }
-      }
-   } else if (scraper_type === 'serpapi') {
-      // results already in json
-      const results: SerpApiResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerpApiResult[];
-      for (const { link, title, position } of results) {
-         if (title && link) {
-            extractedResult.push({
-               title,
-               url: link,
-               position,
-            });
-         }
-      }
-   } else {
-      for (let i = 0; i < searchResult.length; i += 1) {
-         if (searchResult[i]) {
-            const title = $(searchResult[i]).find('h3').html();
-            const url = $(searchResult[i]).find('a').attr('href');
-            // console.log(i, url?.slice(0, 40), title?.slice(0, 40));
-            if (title && url) {
-               lastPosition += 1;
-               extractedResult.push({ title, url, position: lastPosition });
-            }
-         }
-      }
-   }
+   }
    return extractedResult;
 };
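Taken together, the refactor collapses the per-service branches into one generic path: resolve the scraper by id, let it build headers and the request URL, pull its `resultObjectKey` out of the response, and prefer its `serpExtractor` over the generic cheerio HTML parser. A condensed sketch of that flow, assuming the signatures shown in this diff (not verbatim upstream code):

const runScrape = async (keyword: KeywordType, settings: SettingsType) => {
   const scraperObj = allScrapers.find((s: ScraperSettings) => s.id === settings.scraper_type);
   const scraperClient = getScraperClient(keyword, settings, scraperObj);
   if (!scraperClient) { return false; }
   // The proxy scraper resolves to an axios response; API scrapers return fetch Responses.
   const res: any = settings.scraper_type === 'proxy'
      ? await scraperClient
      : await scraperClient.then((r: any) => r.json());
   const payload = scraperObj?.resultObjectKey ? res[scraperObj.resultObjectKey] : (res.data || res.html);
   // JSON services ship their own extractor; raw Google HTML falls back to cheerio parsing.
   return scraperObj?.serpExtractor ? scraperObj.serpExtractor(payload) : extractScrapedResult(payload);
};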