Mirror of https://github.com/towfiqi/serpbear (synced 2025-06-26 18:15:54 +00:00)

Compare commits (6 commits)

| SHA1 |
|---|
| 0c8b457eee |
| 123ad81dae |
| 6c48b87e5e |
| cf8b2c6913 |
| 6d6e2f63d0 |
| 74c9603293 |
CHANGELOG.md

@@ -2,6 +2,15 @@

All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.

### [0.2.2](https://github.com/towfiqi/serpbear/compare/v0.2.1...v0.2.2) (2022-12-25)

### Bug Fixes

* Fixes bug that prevents Saving API settings ([123ad81](https://github.com/towfiqi/serpbear/commit/123ad81dae10aa28848148d0f3da5cf1f7de7c57)), closes [#45](https://github.com/towfiqi/serpbear/issues/45)

### [0.2.1](https://github.com/towfiqi/serpbear/compare/v0.2.0...v0.2.1) (2022-12-24)

## [0.2.0](https://github.com/towfiqi/serpbear/compare/v0.1.7...v0.2.0) (2022-12-21)
@@ -56,6 +56,9 @@ const Keyword = (props: KeywordProps) => {

const historySorted = historyArray.sort((a, b) => a.date - b.date);
const previousPos = historySorted[historySorted.length - 2].position;
status = previousPos === 0 ? position : previousPos - position;
if (position === 0 && previousPos > 0) {
status = previousPos - 100;
}
}
return status;
}, [history, position]);
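The hunk above (and the matching getPositionChange hunk further down) computes how far a keyword moved since the previous scrape, where a position of 0 appears to mean "not in the top 100". A minimal sketch of that comparison logic as a standalone helper, with hypothetical history values; the names `HistoryPoint` and `positionChange` are illustrative, only the rules mirror the diff:

```ts
// Sketch only: mirrors the position-change rules shown in the diff above.
type HistoryPoint = { date: number, position: number };

const positionChange = (history: HistoryPoint[], position: number): number => {
   let status = 0;
   if (history.length > 1) {
      const sorted = [...history].sort((a, b) => a.date - b.date);
      const previousPos = sorted[sorted.length - 2].position;
      // If the keyword was previously unranked (0), report the current position as the change.
      status = previousPos === 0 ? position : previousPos - position;
      // If the keyword just dropped out of the top 100, report the fall from its old spot.
      if (position === 0 && previousPos > 0) {
         status = previousPos - 100;
      }
   }
   return status;
};

console.log(positionChange([{ date: 1, position: 0 }, { date: 2, position: 8 }], 8));   // 8 (entered the top 100)
console.log(positionChange([{ date: 1, position: 12 }, { date: 2, position: 5 }], 5));  // 7 (moved up 7 places)
console.log(positionChange([{ date: 1, position: 40 }, { date: 2, position: 0 }], 0));  // -60 (dropped out)
```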
@@ -63,21 +63,20 @@ const Settings = ({ closeSettings }:SettingsProps) => {

const performUpdate = () => {
let error: null|SettingsError = null;
if (settings.notification_interval !== 'never') {
const { notification_interval, notification_email, notification_email_from, scraper_type, smtp_port, smtp_server, scaping_api } = settings;
if (notification_interval !== 'never') {
if (!settings.notification_email) {
error = { type: 'no_email', msg: 'Insert a Valid Email address' };
}
if (settings.notification_email
&& (!settings.smtp_port || !settings.smtp_server
|| !settings.notification_email_from)) {
if (notification_email && (!smtp_port || !smtp_server || !notification_email_from)) {
let type = 'no_smtp_from';
if (!settings.smtp_port) { type = 'no_smtp_port'; }
if (!settings.smtp_server) { type = 'no_smtp_server'; }
if (!smtp_port) { type = 'no_smtp_port'; }
if (!smtp_server) { type = 'no_smtp_server'; }
error = { type, msg: 'Insert SMTP Server details that will be used to send the emails.' };
}
}

if (['scrapingant', 'scrapingrobot', 'serply', 'serpapi'].includes(settings.scraper_type) && !settings.scaping_api) {
if (scraper_type !== 'proxy' && scraper_type !== 'none' && !scaping_api) {
error = { type: 'no_api_key', msg: 'Insert a Valid API Key or Token for the Scraper Service.' };
}

@@ -98,14 +97,8 @@ const Settings = ({ closeSettings }:SettingsProps) => {

{ label: 'Monthly', value: 'monthly' },
{ label: 'Never', value: 'never' },
];
const scraperOptions: SelectionOption[] = [
{ label: 'None', value: 'none' },
{ label: 'Proxy', value: 'proxy' },
{ label: 'ScrapingAnt.com', value: 'scrapingant' },
{ label: 'ScrapingRobot.com', value: 'scrapingrobot' },
{ label: 'serply.io', value: 'serply' },
{ label: 'serpapi.com', value: 'serpapi' },
];
const allScrapers: SelectionOption[] = settings.available_scapers ? settings.available_scapers : [];
const scraperOptions: SelectionOption[] = [{ label: 'None', value: 'none' }, ...allScrapers];

const tabStyle = 'inline-block px-4 py-1 rounded-full mr-3 cursor-pointer text-sm';
return (
package-lock.json (generated, 4 lines changed)

@@ -1,12 +1,12 @@

{
"name": "serpbear",
"version": "0.2.0",
"version": "0.2.2",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "serpbear",
"version": "0.2.0",
"version": "0.2.2",
"dependencies": {
"@googleapis/searchconsole": "^1.0.0",
"@testing-library/react": "^13.4.0",

package.json

@@ -1,6 +1,6 @@

{
"name": "serpbear",
"version": "0.2.0",
"version": "0.2.2",
"private": true,
"scripts": {
"dev": "next dev",
@@ -2,6 +2,7 @@ import type { NextApiRequest, NextApiResponse } from 'next';

import Cryptr from 'cryptr';
import { writeFile, readFile } from 'fs/promises';
import verifyUser from '../../utils/verifyUser';
import allScrapers from '../../scrapers/index';

type SettingsGetResponse = {
settings?: object | null,

@@ -65,6 +66,7 @@ export const getAppSettings = async () : Promise<SettingsType> => {

scaping_api,
smtp_password,
search_console_integrated: !!(process.env.SEARCH_CONSOLE_PRIVATE_KEY && process.env.SEARCH_CONSOLE_CLIENT_EMAIL),
available_scapers: allScrapers.map((scraper) => ({ label: scraper.name, value: scraper.id })),
};
} catch (error) {
console.log('Error Decrypting Settings API Keys!');
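The `available_scapers` value produced by the map above is just the name/id pairs declared by each bundled scraper module. A sketch of what it would contain after this change, derived from the service files and the registry order in scrapers/index.ts below (the variable name is illustrative):

```ts
// Shape of settings.available_scapers after this change, based on the
// name/id fields of the bundled scraper modules shown in this diff.
const availableScrapers = [
  { label: 'Scraping Robot', value: 'scrapingrobot' },
  { label: 'ScrapingAnt', value: 'scrapingant' },
  { label: 'SerpApi.com', value: 'serpapi' },
  { label: 'Serply', value: 'serply' },
  { label: 'Proxy', value: 'proxy' },
];
```

The Settings component then renders this list as its scraper dropdown, prepending only the static "None" option.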
scrapers/index.ts (new file, 13 lines)

@@ -0,0 +1,13 @@

import scrapingAnt from './services/scrapingant';
import scrapingRobot from './services/scrapingrobot';
import serpapi from './services/serpapi';
import serply from './services/serply';
import proxy from './services/proxy';

export default [
scrapingRobot,
scrapingAnt,
serpapi,
serply,
proxy,
];
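With this registry in place, adding a scraper is a matter of dropping a new `ScraperSettings` object into scrapers/services and listing it in the array above. A hypothetical example module (the service name, id, domain and response format are invented, not a real provider), shown only to illustrate the shape the registry expects:

```ts
// scrapers/services/exampleserp.ts (hypothetical): a minimal ScraperSettings
// implementation for an imaginary JSON search API.
const exampleSerp: ScraperSettings = {
   id: 'exampleserp',
   name: 'ExampleSERP',
   website: 'example.com',
   // Key of the response object that holds the organic results.
   resultObjectKey: 'results',
   headers: (keyword, settings) => ({
      'Content-Type': 'application/json',
      'X-Api-Key': settings.scaping_api,
   }),
   scrapeURL: (keyword, settings, countryData) => {
      const lang = countryData[keyword.country] ? countryData[keyword.country][2] : 'en';
      return `https://api.example.com/search?q=${encodeURI(keyword.keyword)}&num=100&hl=${lang}`;
   },
   serpExtractor: (content) => {
      // The imaginary API is assumed to return [{ title, url, position }, ...].
      const items: { title: string, url: string, position: number }[] = JSON.parse(content);
      return items.filter((item) => item.title && item.url);
   },
};

export default exampleSerp;
```

It would then be imported in scrapers/index.ts and appended to the exported array, which is all that is needed for it to show up in the Settings dropdown via `available_scapers`.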
scrapers/services/proxy.ts (new file, 35 lines)

@@ -0,0 +1,35 @@

import cheerio from 'cheerio';

const proxy:ScraperSettings = {
id: 'proxy',
name: 'Proxy',
website: '',
resultObjectKey: 'data',
headers: () => {
return { Accept: 'gzip,deflate,compress;' };
},
scrapeURL: (keyword) => {
return `https://www.google.com/search?num=100&q=${encodeURI(keyword.keyword)}`;
},
serpExtractor: (content) => {
const extractedResult = [];

const $ = cheerio.load(content);
let lastPosition = 0;
const mainContent = $('body').find('#main');
const children = $(mainContent).find('h3');

for (let index = 0; index < children.length; index += 1) {
const title = $(children[index]).text();
const url = $(children[index]).closest('a').attr('href');
const cleanedURL = url ? url.replaceAll(/^.+?(?=https:|$)/g, '').replaceAll(/(&).*/g, '') : '';
if (title && url) {
lastPosition += 1;
extractedResult.push({ title, url: cleanedURL, position: lastPosition });
}
}
return extractedResult;
},
};

export default proxy;
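The two `replaceAll` calls in the proxy extractor turn Google's redirect hrefs into plain target URLs: the first strips everything up to the first `https:`, the second drops everything from the first `&` onward. A small sketch of that cleanup on a hypothetical href (the raw value is illustrative, not a captured result):

```ts
// Hypothetical href as it might appear in the lightweight Google HTML served to proxies.
const rawHref = '/url?q=https://example.com/page&sa=U&ved=2ahUKE';

const cleanedURL = rawHref
   .replaceAll(/^.+?(?=https:|$)/g, '')  // drop the '/url?q=' prefix before 'https:'
   .replaceAll(/(&).*/g, '');            // drop '&sa=U&ved=...' and anything after

console.log(cleanedURL); // https://example.com/page
```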
scrapers/services/scrapingant.ts (new file, 14 lines)

@@ -0,0 +1,14 @@

const scrapingAnt:ScraperSettings = {
id: 'scrapingant',
name: 'ScrapingAnt',
website: 'scrapingant.com',
scrapeURL: (keyword, settings, countryData) => {
const scraperCountries = ['AE', 'BR', 'CN', 'DE', 'ES', 'FR', 'GB', 'HK', 'PL', 'IN', 'IT', 'IL', 'JP', 'NL', 'RU', 'SA', 'US', 'CZ'];
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
const lang = countryData[country][2];
return `https://api.scrapingant.com/v2/extended?url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}&x-api-key=${settings.scaping_api}&proxy_country=${country}&browser=false`;
},
resultObjectKey: 'result',
};

export default scrapingAnt;
scrapers/services/scrapingrobot.ts (new file, 13 lines)

@@ -0,0 +1,13 @@

const scrapingRobot:ScraperSettings = {
id: 'scrapingrobot',
name: 'Scraping Robot',
website: 'scrapingrobot.com',
scrapeURL: (keyword, settings, countryData) => {
const country = keyword.country || 'US';
const lang = countryData[country][2];
return `https://api.scrapingrobot.com/?token=${settings.scaping_api}&proxyCountry=${country}&render=false${keyword.device === 'mobile' ? '&mobile=true' : ''}&url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}`;
},
resultObjectKey: 'result',
};

export default scrapingRobot;
scrapers/services/serpapi.ts (new file, 38 lines)

@@ -0,0 +1,38 @@

interface SerpApiResult {
title: string,
link: string,
position: number,
}

const serpapi:ScraperSettings = {
id: 'serpapi',
name: 'SerpApi.com',
website: 'serpapi.com',
headers: (keyword, settings) => {
return {
'Content-Type': 'application/json',
'X-API-Key': settings.scaping_api,
};
},
scrapeURL: (keyword, settings) => {
return `https://serpapi.com/search?q=${encodeURI(keyword.keyword)}&num=100&gl=${keyword.country}&device=${keyword.device}&api_key=${settings.scaping_api}`;
},
resultObjectKey: 'organic_results',
serpExtractor: (content) => {
const extractedResult = [];
const results: SerpApiResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerpApiResult[];

for (const { link, title, position } of results) {
if (title && link) {
extractedResult.push({
title,
url: link,
position,
});
}
}
return extractedResult;
},
};

export default serpapi;
scrapers/services/serply.ts (new file, 42 lines)

@@ -0,0 +1,42 @@

interface SerplyResult {
title: string,
link: string,
realPosition: number,
}
const scraperCountries = ['US', 'CA', 'IE', 'GB', 'FR', 'DE', 'SE', 'IN', 'JP', 'KR', 'SG', 'AU', 'BR'];

const serply:ScraperSettings = {
id: 'serply',
name: 'Serply',
website: 'serply.io',
headers: (keyword, settings) => {
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
return {
'Content-Type': 'application/json',
'X-User-Agent': keyword.device === 'mobile' ? 'mobile' : 'desktop',
'X-Api-Key': settings.scaping_api,
'X-Proxy-Location': country,
};
},
scrapeURL: (keyword) => {
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
return `https://api.serply.io/v1/search/q=${encodeURI(keyword.keyword)}&num=100&hl=${country}`;
},
resultObjectKey: 'result',
serpExtractor: (content) => {
const extractedResult = [];
const results: SerplyResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerplyResult[];
for (const result of results) {
if (result.title && result.link) {
extractedResult.push({
title: result.title,
url: result.link,
position: result.realPosition,
});
}
}
return extractedResult;
},
};

export default serply;
types.d.ts (vendored, 18 lines changed)

@@ -54,7 +54,7 @@ type KeywordFilters = {

}

type countryData = {
[ISO:string] : string[]
[ISO:string] : [countryName:string, cityName:string, language:string]
}

type countryCodeData = {

@@ -78,6 +78,7 @@ type SettingsType = {

smtp_username?: string,
smtp_password?: string,
search_console_integrated?: boolean,
available_scapers?: Array
}

type KeywordSCDataChild = {

@@ -163,3 +164,18 @@ type SCDomainDataType = {

}

type SCKeywordType = SearchAnalyticsItem;

type scraperExtractedItem = {
title: string,
url: string,
position: number,
}
interface ScraperSettings {
id:string,
name:string,
website:string,
resultObjectKey: string,
headers?(keyword:KeywordType, settings: SettingsType): Object,
scrapeURL?(keyword:KeywordType, settings:SettingsType, countries:countryData): string,
serpExtractor?(content:string): scraperExtractedItem[],
}
@@ -60,7 +60,10 @@ const getPositionChange = (history:KeywordHistory, position:number) : number =>

}));
const historySorted = historyArray.sort((a, b) => a.date - b.date);
const previousPos = historySorted[historySorted.length - 2].position;
status = previousPos - position;
status = previousPos === 0 ? position : previousPos - position;
if (position === 0 && previousPos > 0) {
status = previousPos - 100;
}
}
return status;
};
utils/scraper.ts (152 lines changed)

@@ -1,10 +1,9 @@

import axios, { AxiosResponse, CreateAxiosDefaults } from 'axios';
// import axiosRetry from 'axios-retry';
// import path from 'path';
import cheerio from 'cheerio';
import { readFile, writeFile } from 'fs/promises';
import HttpsProxyAgent from 'https-proxy-agent';
import countries from './countries';
import allScrapers from '../scrapers/index';

type SearchResult = {
title: string,

@@ -26,25 +25,13 @@ export type RefreshResult = false | {

error?: boolean | string
}

interface SerplyResult {
title: string,
link: string,
realPosition: number,
}

interface SerpApiResult {
title: string,
link: string,
position: number,
}

/**
 * Creates a SERP Scraper client promise based on the app settings.
 * @param {KeywordType} keyword - the keyword to get the SERP for.
 * @param {SettingsType} settings - the App Settings that contains the scraper details
 * @returns {Promise}
 */
export const getScraperClient = (keyword:KeywordType, settings:SettingsType): Promise<AxiosResponse|Response> | false => {
export const getScraperClient = (keyword:KeywordType, settings:SettingsType, scraper?: ScraperSettings): Promise<AxiosResponse|Response> | false => {
let apiURL = ''; let client: Promise<AxiosResponse|Response> | false = false;
const headers: any = {
'Content-Type': 'application/json',

@@ -58,40 +45,27 @@ export const getScraperClient = (keyword:KeywordType, settings:SettingsType): Pr

headers['User-Agent'] = mobileAgent;
}

if (settings && settings.scraper_type === 'scrapingant' && settings.scaping_api) {
const scraperCountries = ['AE', 'BR', 'CN', 'DE', 'ES', 'FR', 'GB', 'HK', 'PL', 'IN', 'IT', 'IL', 'JP', 'NL', 'RU', 'SA', 'US', 'CZ'];
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
const lang = countries[country][2];
apiURL = `https://api.scrapingant.com/v2/extended?url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}&x-api-key=${settings.scaping_api}&proxy_country=${country}&browser=false`;
}

if (settings && settings.scraper_type === 'scrapingrobot' && settings.scaping_api) {
const country = keyword.country || 'US';
const lang = countries[country][2];
apiURL = `https://api.scrapingrobot.com/?token=${settings.scaping_api}&proxyCountry=${country}&render=false${keyword.device === 'mobile' ? '&mobile=true' : ''}&url=https%3A%2F%2Fwww.google.com%2Fsearch%3Fnum%3D100%26hl%3D${lang}%26q%3D${encodeURI(keyword.keyword)}`;
}

// Serply.io docs https://docs.serply.io/api
if (settings && settings.scraper_type === 'serply' && settings.scaping_api) {
const scraperCountries = ['US', 'CA', 'IE', 'GB', 'FR', 'DE', 'SE', 'IN', 'JP', 'KR', 'SG', 'AU', 'BR'];
const country = scraperCountries.includes(keyword.country.toUpperCase()) ? keyword.country : 'US';
if (keyword.device === 'mobile') {
headers['X-User-Agent'] = 'mobile';
} else {
headers['X-User-Agent'] = 'desktop';
if (scraper) {
// Set Scraper Header
const scrapeHeaders = scraper.headers ? scraper.headers(keyword, settings) : null;
const scraperAPIURL = scraper.scrapeURL ? scraper.scrapeURL(keyword, settings, countries) : null;
if (scrapeHeaders) {
Object.keys(scrapeHeaders).forEach((headerItemKey:string) => {
headers[headerItemKey] = scrapeHeaders[headerItemKey as keyof object];
});
}
// Set Scraper API URL
// If not URL is generated, stop right here.
if (scraperAPIURL) {
apiURL = scraperAPIURL;
} else {
return false;
}
headers['X-Proxy-Location'] = country;
headers['X-Api-Key'] = settings.scaping_api;
apiURL = `https://api.serply.io/v1/search/q=${encodeURI(keyword.keyword)}&num=100&hl=${country}`;
}

// SerpApi docs: https://serpapi.com
if (settings && settings.scraper_type === 'serpapi' && settings.scaping_api) {
apiURL = `https://serpapi.com/search?q=${encodeURI(keyword.keyword)}&num=100&gl=${keyword.country}&device=${keyword.device}&api_key=${settings.scaping_api}`;
}

if (settings && settings.scraper_type === 'proxy' && settings.proxy) {
const axiosConfig: CreateAxiosDefaults = {};
headers.Accept = 'gzip,deflate,compress;';
axiosConfig.headers = headers;
const proxies = settings.proxy.split(/\r?\n|\r|\n/g);
let proxyURL = '';

@@ -128,30 +102,37 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett

result: keyword.lastResult,
error: true,
};
const scraperClient = getScraperClient(keyword, settings);
const scraperType = settings?.scraper_type || '';
const scraperObj = allScrapers.find((scraper:ScraperSettings) => scraper.id === scraperType);
const scraperClient = getScraperClient(keyword, settings, scraperObj);

if (!scraperClient) { return false; }
let res:any = null; let scraperError:any = null;
try {
if (settings && settings.scraper_type === 'proxy' && settings.proxy) {
res = await scraperClient;
} else {
res = await scraperClient.then((result:any) => result.json());
}

if (res && (res.data || res.html || res.result || res.results || res.organic_results)) {
const extracted = extractScrapedResult(res.data || res.html || res.result || res.results || res.organic_results, settings.scraper_type);
// await writeFile('result.txt', JSON.stringify(extracted), { encoding: 'utf-8' }).catch((err) => { console.log(err); });
let scraperError:any = null;
try {
const res = scraperType === 'proxy' && settings.proxy ? await scraperClient : await scraperClient.then((reslt:any) => reslt.json());
const scraperResult = scraperObj?.resultObjectKey && res[scraperObj.resultObjectKey] ? res[scraperObj.resultObjectKey] : '';
const scrapeResult:string = (res.data || res.html || res.results || scraperResult || '');
if (res && scrapeResult) {
const extracted = scraperObj?.serpExtractor ? scraperObj.serpExtractor(scrapeResult) : extractScrapedResult(scrapeResult);
// await writeFile('result.txt', JSON.stringify(scrapeResult), { encoding: 'utf-8' }).catch((err) => { console.log(err); });
const serp = getSerp(keyword.domain, extracted);
refreshedResults = { ID: keyword.ID, keyword: keyword.keyword, position: serp.postion, url: serp.url, result: extracted, error: false };
console.log('SERP: ', keyword.keyword, serp.postion, serp.url);
console.log('[SERP]: ', keyword.keyword, serp.postion, serp.url);
} else {
scraperError = res.detail || res.error || 'Unknown Error';
throw new Error(res);
}
} catch (error:any) {
console.log('#### SCRAPE ERROR: ', keyword.keyword, '. Error: ', scraperError);
refreshedResults.error = scraperError;
if (settings.scraper_type === 'proxy' && error && error.response && error.response.statusText) {
refreshedResults.error = `[${error.response.status}] ${error.response.statusText}`;
}

console.log('[ERROR] Scraping Keyword : ', keyword.keyword, '. Error: ', error && error.response && error.response.statusText);
if (!(error && error.response && error.response.statusText)) {
console.log('[ERROR_MESSAGE]: ', error);
}
}

return refreshedResults;

@@ -160,10 +141,9 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett

/**
 * Extracts the Google Search result as object array from the Google Search's HTML content
 * @param {string} content - scraped google search page html data.
 * @param {string} scraper_type - the type of scraper (Proxy or Scraper)
 * @returns {SearchResult[]}
 */
export const extractScrapedResult = (content: string, scraper_type:string): SearchResult[] => {
export const extractScrapedResult = (content: string): SearchResult[] => {
const extractedResult = [];

const $ = cheerio.load(content);

@@ -171,57 +151,17 @@ export const extractScrapedResult = (content: string, scraper_type:string): Sear

const searchResult = hasNumberofResult.children();
let lastPosition = 0;

if (scraper_type === 'proxy') {
const mainContent = $('body').find('#main');
const children = $(mainContent).find('h3');

for (let index = 0; index < children.length; index += 1) {
const title = $(children[index]).text();
const url = $(children[index]).closest('a').attr('href');
const cleanedURL = url ? url.replace('/url?q=', '').replace(/&sa=.*/, '') : '';
for (let i = 0; i < searchResult.length; i += 1) {
if (searchResult[i]) {
const title = $(searchResult[i]).find('h3').html();
const url = $(searchResult[i]).find('a').attr('href');
// console.log(i, url?.slice(0, 40), title?.slice(0, 40));
if (title && url) {
lastPosition += 1;
extractedResult.push({ title, url: cleanedURL, position: lastPosition });
extractedResult.push({ title, url, position: lastPosition });
}
}
} else if (scraper_type === 'serply') {
// results already in json
const results: SerplyResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerplyResult[];
for (const result of results) {
if (result.title && result.link) {
extractedResult.push({
title: result.title,
url: result.link,
position: result.realPosition,
});
}
}
} else if (scraper_type === 'serpapi') {
// results already in json
const results: SerpApiResult[] = (typeof content === 'string') ? JSON.parse(content) : content as SerpApiResult[];

for (const { link, title, position } of results) {
if (title && link) {
extractedResult.push({
title,
url: link,
position,
});
}
}
} else {
for (let i = 0; i < searchResult.length; i += 1) {
if (searchResult[i]) {
const title = $(searchResult[i]).find('h3').html();
const url = $(searchResult[i]).find('a').attr('href');
// console.log(i, url?.slice(0, 40), title?.slice(0, 40));
if (title && url) {
lastPosition += 1;
extractedResult.push({ title, url, position: lastPosition });
}
}
}
}
}

return extractedResult;
};
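Taken together, the refactor replaces the per-service if/else blocks with a lookup-and-dispatch flow driven by the scraper registry. A condensed sketch of that flow, assuming it sits alongside the helpers in utils/scraper.ts (`countries`, `extractScrapedResult` and the ambient types come from the repo; the function name and the use of the global `fetch` here are illustrative, not the repo's exact code):

```ts
// Sketch only: the shape of the new dispatch flow, not the repo's implementation.
import countries from './countries';
import allScrapers from '../scrapers/index';

const scrapeWithRegisteredScraper = async (keyword: KeywordType, settings: SettingsType) => {
   // 1. Resolve the ScraperSettings module that matches the configured scraper_type.
   const scraper = allScrapers.find((s: ScraperSettings) => s.id === settings.scraper_type);
   if (!scraper || !scraper.scrapeURL) { return false; }

   // 2. Let the module build its own request headers and API URL.
   const headers = scraper.headers ? scraper.headers(keyword, settings) : {};
   const res: any = await fetch(scraper.scrapeURL(keyword, settings, countries), { headers } as any)
      .then((r) => r.json());

   // 3. Pull the payload out of the key the module declared, then parse it with the
   //    module's own extractor (or fall back to the generic HTML extractor).
   const payload = res[scraper.resultObjectKey];
   return scraper.serpExtractor ? scraper.serpExtractor(payload) : extractScrapedResult(payload);
};
```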