From 6d6e2f63d09554bc533be212c0ed3c05aaf7a2a4 Mon Sep 17 00:00:00 2001 From: Towfiq Date: Sat, 24 Dec 2022 08:29:44 +0600 Subject: [PATCH] fixes: Broken Proxy Scraper fixes #33 --- utils/scraper.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/utils/scraper.ts b/utils/scraper.ts index 24df4b9..c09782c 100644 --- a/utils/scraper.ts +++ b/utils/scraper.ts @@ -92,6 +92,7 @@ export const getScraperClient = (keyword:KeywordType, settings:SettingsType): Pr if (settings && settings.scraper_type === 'proxy' && settings.proxy) { const axiosConfig: CreateAxiosDefaults = {}; + headers.Accept = 'gzip,deflate,compress;'; axiosConfig.headers = headers; const proxies = settings.proxy.split(/\r?\n|\r|\n/g); let proxyURL = ''; @@ -139,9 +140,10 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett res = await scraperClient.then((result:any) => result.json()); } - if (res && (res.data || res.html || res.result || res.results || res.organic_results)) { - const extracted = extractScrapedResult(res.data || res.html || res.result || res.results || res.organic_results, settings.scraper_type); - // await writeFile('result.txt', JSON.stringify(extracted), { encoding: 'utf-8' }).catch((err) => { console.log(err); }); + const scrapeResult = (res.data || res.html || res.result || res.results || res.organic_results || ''); + if (res && scrapeResult) { + const extracted = extractScrapedResult(scrapeResult, settings.scraper_type); + // await writeFile('result.txt', JSON.stringify(scrapeResult), { encoding: 'utf-8' }).catch((err) => { console.log(err); }); const serp = getSerp(keyword.domain, extracted); refreshedResults = { ID: keyword.ID, keyword: keyword.keyword, position: serp.postion, url: serp.url, result: extracted, error: false }; console.log('SERP: ', keyword.keyword, serp.postion, serp.url); @@ -150,8 +152,11 @@ export const scrapeKeywordFromGoogle = async (keyword:KeywordType, settings:Sett throw new Error(res); } } catch (error:any) { - console.log('#### SCRAPE ERROR: ', keyword.keyword, '. Error: ', scraperError); + console.log('[ERROR] Scraping Keyword : ', keyword.keyword, '. Error: ', error && error.response && error.response.statusText); refreshedResults.error = scraperError; + if (settings.scraper_type === 'proxy' && error && error.response && error.response.statusText) { + refreshedResults.error = `[${error.response.status}] ${error.response.statusText}`; + } } return refreshedResults; @@ -178,7 +183,7 @@ export const extractScrapedResult = (content: string, scraper_type:string): Sear for (let index = 0; index < children.length; index += 1) { const title = $(children[index]).text(); const url = $(children[index]).closest('a').attr('href'); - const cleanedURL = url ? url.replace('/url?q=', '').replace(/&sa=.*/, '') : ''; + const cleanedURL = url ? url.replaceAll(/^.+?(?=https:|$)/g, '').replaceAll(/(&).*/g, '') : ''; if (title && url) { lastPosition += 1; extractedResult.push({ title, url: cleanedURL, position: lastPosition });