How I Finally Beat Amazon’s Bot Detection (and Built a Powerful Web Scraper That Works!)
Once your scraper function (scrapeAmazonProductPage) is ready, the next step is to wrap it inside a simple Express.js API. This allows you (or any client app) to send a request with a product URL and get structured data in return.
📦 Step 1 — Install Dependencies
If you haven’t already:
npm install express puppeteer cheerio crawler cors
You should now have these main dependencies:
{ "cheerio": "^1.0.0-rc.12", "crawler": "^1.5.0", "puppeteer": "^16.2.0", "express": "^4.19.2" }
Make sure your Node.js version is 20 or above for optimal Puppeteer compatibility.
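If you want npm to flag the wrong runtime for you, you can record that requirement in package.json (a small optional sketch, not something the project needs to run):

{
  "engines": { "node": ">=20" }
}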
🧱 Step 2 — Create Project Structure
Here’s a suggested folder layout:
amazon-scraper/
├── package.json
├── server.js
└── src/
    ├── scraper/
    │   └── amazon.js
    └── services/
        └── scrapping.js
server.js → entry point for Express
src/scraper/amazon.js → your scraper logic (the code you already have)
src/services/scrapping.js → optional, for error logging (you can mock this for now)
🧠 Step 3 — Example Mock for Error Saver
Create a dummy service in src/services/scrapping.js:
// src/services/scrapping.js
async function saveScrappingErrors(errorObj) {
  console.error("Scraping error:", errorObj);
}

module.exports = { saveScrappingErrors };
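If you'd rather keep a record that survives restarts, here is a minimal file-backed variant (a sketch: errors.log is an arbitrary filename, and each error is appended as one JSON line so the log stays easy to parse):

// src/services/scrapping.js (optional file-backed variant)
const fs = require('fs/promises');

async function saveScrappingErrors(errorObj) {
  console.error('Scraping error:', errorObj);
  // One JSON object per line, with a timestamp added for debugging.
  const line = JSON.stringify({ ...errorObj, at: new Date().toISOString() }) + '\n';
  await fs.appendFile('errors.log', line);
}

module.exports = { saveScrappingErrors };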
🧠 Step 4 — The Scraper (amazon.js)
Use your scraper function exactly as before.
Let’s slightly clean it for API use and export it properly:
// src/scraper/amazon.js
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const Crawler = require('crawler');
const { saveScrappingErrors } = require('../services/scrapping');
const crawlPage = (url, browser, attempt = 0) => {
  return new Promise((resolve, reject) => {
    const c = new Crawler({
      maxConnections: 100000,
      skipDuplicates: true,
      callback: async (error, res, done) => {
        if (error) return reject(error);
        try {
          const $ = cheerio.load(res.body);
          // Amazon sometimes serves a page without the review histogram
          // (usually a bot check). Retry a few times instead of recursing forever.
          if (!$('#histogramTable').length) {
            if (attempt >= 5) return reject(new Error('Review histogram not found after 5 attempts'));
            return resolve(await crawlPage(url, browser, attempt + 1));
          }
          const reviews = [];
          const reviewElements = $('.a-section.review[data-hook="review"]');
          const review_rating = $('[data-hook="average-star-rating"]').text().trim();
          const review_count = $('[data-hook="total-review-count"]').text().trim().split(' ')[0];
          const name = $('#productTitle').text().trim();
          const description = $('#feature-bullets .a-list-item').text().trim();
          const product_author = $('#bylineInfo').text().trim();
          // Pulls the leading number out of text like "4.0 out of 5 stars".
          const regex = /\b\d+(\.\d+)?\b/;
          reviewElements.each((_, el) => {
            const author = $(el).find('.a-profile-name').text().trim();
            const content = $(el).find('.review-text').text().trim();
            const title = $(el).find('[data-hook="review-title"]').text().trim();
            const date = $(el).find('[data-hook="review-date"]').text().trim();
            let stars = $(el).find('.review-rating span').text().trim();
            const match = stars.match(regex);
            stars = match ? parseFloat(match[0]) : '';
            reviews.push({ author, content, title, date, rating: stars });
          });
          // Per-star percentages from the rating histogram table.
          const extractStars = () => {
            const starsPercentageArray = [];
            $('#histogramTable .a-histogram-row').each((_, el) => {
              const percentageText = $(el).find('.a-text-right a').text();
              const percentage = parseInt(percentageText.replace('%', ''), 10);
              const starsText = $(el).find('a.a-size-base').text();
              const number_of_stars = parseInt(starsText, 10);
              starsPercentageArray.push({ percentage: percentage || 0, number_of_stars });
            });
            return starsPercentageArray;
          };
          const extractMainImage = () => $('#imgTagWrapperId img').attr('src') || '';
          // The displayed price lives in an offscreen accessibility element.
          const core_price = $('#corePriceDisplay_desktop_feature_div .a-section .aok-offscreen').text().trim();
          const currencyPattern = /\$\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?/;
          const match = core_price.match(currencyPattern);
          const extractedCurrency = match ? match[0] : "";
          // Gallery images are lazy-loaded: render the fetched HTML in a real
          // page, hover each thumbnail to trigger loading, then collect URLs.
          const extractImages = async () => {
            const htmlContent = res.body;
            const page = await browser.newPage();
            await page.setContent(htmlContent, { waitUntil: 'load', timeout: 0 });
            const thumbnails = await page.$$('#altImages ul .imageThumbnail');
            for (const thumbnail of thumbnails) {
              await page.evaluate(el => el instanceof HTMLElement && el.scrollIntoView(), thumbnail);
              await thumbnail.hover();
            }
            await page.waitForTimeout(1000);
            const productData = await page.evaluate(() => {
              const images = [];
              document.querySelectorAll('.a-unordered-list .image .imgTagWrapper img').forEach(img => {
                if (img && img.src && !img.src.endsWith('.svg')) images.push(img.src);
              });
              return images;
            });
            await page.close();
            return productData;
          };
          const images_data = await extractImages();
          resolve({
            websiteName: 'Amazon',
            reviews,
            product_images_links: images_data,
            review_rating,
            review_count,
            price: extractedCurrency,
            name,
            description,
            product_author,
            stars: extractStars(),
            image_url: extractMainImage(),
          });
        } catch (err) {
          reject(err);
        } finally {
          done();
        }
      },
    });
    c.queue(url);
  });
};
async function scrapeAmazonProductPage(homeUrl) {
  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true,
    args: [
      "--disable-gpu",
      "--disable-dev-shm-usage",
      "--disable-setuid-sandbox",
      "--no-sandbox",
    ],
  });
  try {
    const data = await crawlPage(homeUrl, browser);
    return data;
  } catch (e) {
    await saveScrappingErrors({ error: e.message || e, url: homeUrl });
    return null;
  } finally {
    await browser.close();
  }
}

module.exports = { scrapeAmazonProductPage };
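Before adding the API layer, you can sanity-check the module on its own with a throwaway script (test.js is just a hypothetical filename; the URL is the example product used later in this post):

// test.js (run it with: node test.js)
const { scrapeAmazonProductPage } = require('./src/scraper/amazon');

(async () => {
  const data = await scrapeAmazonProductPage('https://www.amazon.com/dp/B0BP9Z7K5V');
  console.log(JSON.stringify(data, null, 2));
})();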
⚡ Step 5 — Create Express API
Now create server.js in the root:
// server.js
const express = require('express');
const cors = require('cors');
const { scrapeAmazonProductPage } = require('./src/scraper/amazon');

const app = express();
app.use(express.json());
app.use(cors());

// Health check
app.get('/', (req, res) => {
  res.send('✅ Amazon Scraper API is running...');
});

// Main API endpoint
app.post('/api/scrape', async (req, res) => {
  const { url } = req.body;
  if (!url || !url.includes('amazon')) {
    return res.status(400).json({ error: 'Invalid or missing Amazon URL' });
  }
  try {
    const data = await scrapeAmazonProductPage(url);
    if (!data) {
      return res.status(500).json({ error: 'Failed to scrape product data' });
    }
    res.json(data);
  } catch (error) {
    console.error('Scrape failed:', error);
    res.status(500).json({ error: error.message || 'Unexpected error' });
  }
});

const PORT = process.env.PORT || 4000;
app.listen(PORT, () => console.log(`🚀 Server running on port ${PORT}`));
🧪 Step 6 — Test the API
Run the server:
node server.js
Then use Postman, curl, or any HTTP client to test:
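For example, with curl (this sends the same request shown below):

curl -X POST http://localhost:4000/api/scrape \
  -H "Content-Type: application/json" \
  -d '{"url": "https://www.amazon.com/dp/B0BP9Z7K5V"}'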
Request:

POST http://localhost:4000/api/scrape
Content-Type: application/json

{
  "url": "https://www.amazon.com/dp/B0BP9Z7K5V"
}

Response:

{
  "websiteName": "Amazon",
  "name": "Apple AirPods (3rd Generation)",
  "price": "$169.99",
  "review_rating": "4.7 out of 5 stars",
  "review_count": "145,201",
  "description": "Spatial Audio with dynamic head tracking...",
  "product_author": "Apple",
  "stars": [
    { "number_of_stars": 5, "percentage": 85 },
    { "number_of_stars": 4, "percentage": 10 }
  ],
  "product_images_links": [
    "https://m.media-amazon.com/images/I/61ZRU9gnbxL._AC_SL1500_.jpg",
    "https://m.media-amazon.com/images/I/61dw1VHfwbL._AC_SL1500_.jpg"
  ]
}

⚙️ Step 7 — Tips for Production
✅ Use rate limiting to avoid Amazon blocking you (see the sketch below).
✅ Deploy behind a proxy or rotating-IP system if you scrape frequently.
✅ Consider puppeteer-extra-plugin-stealth for better evasion (also sketched below).
✅ Cache results in a database if you'll reuse them often.
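For the rate-limiting tip, express-rate-limit is a common choice. A minimal sketch (this package is not part of the project yet, and the window/max values are arbitrary starting points you should tune):

// server.js: add after `const app = express();`
// npm install express-rate-limit
const rateLimit = require('express-rate-limit');

// Allow at most 10 scrape requests per minute per client IP.
app.use('/api/scrape', rateLimit({ windowMs: 60 * 1000, max: 10 }));

For the stealth tip, puppeteer-extra wraps the regular Puppeteer API, so the only change in src/scraper/amazon.js is the import (again a sketch, assuming you install both packages):

// src/scraper/amazon.js: replace the puppeteer require with these lines
// npm install puppeteer-extra puppeteer-extra-plugin-stealth
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// puppeteer.launch(...) elsewhere in the file now applies the stealth evasions.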