import axios from 'axios';
import * as cheerio from 'cheerio';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import path from 'path';
import fs from 'fs';
// Use the StealthPlugin to avoid detection by websites
puppeteer.use(StealthPlugin());
// Configuration parameters for the request
const requestParams = {
baseURL: `https://www.google.com`, // Base URL used to build the Google Maps search URL
query: "Sarafgarh+Dam,+5QM6%2B2F6,+Odisha+770012", // Example query string (not used below; queries are built per place)
coordinates: "", // Optional GPS coordinates (currently unused)
hl: "en", // Language parameter for the search
};
// Utility function to introduce a delay
const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
/**
* Clicks an element on the page identified by the given selector.
* @param {Page} page - The Puppeteer page object.
* @param {string} selector - The CSS selector of the element to click.
*/
async function clickElement(page, selector) {
// Wait for the element to be visible
await page.waitForSelector(selector, { visible: true });
// Click the element
await page.click(selector);
}
/**
* Opens the "All" photos tab by clicking the button labelled "All" and waits for the content to load.
* @param {Page} page - The Puppeteer page object.
*/
async function openAllPhotosTab(page) {
// Click the button with aria-label="All"
await page.waitForSelector('button[aria-label="All"]', { visible: true });
await page.click('button[aria-label="All"]');
// Wait for any additional loading after clicking the button
await delay(2000);
}
/**
* Downloads an image from the given URL and saves it to the specified file path.
* @param {string} url - The URL of the image to download.
* @param {string} filePath - The path where the image will be saved.
* @returns {Promise} - A promise that resolves when the image is downloaded.
*/
async function downloadImage(url, filePath) {
const response = await axios({
url,
responseType: 'stream',
});
const writer = fs.createWriteStream(filePath);
response.data.pipe(writer);
return new Promise((resolve, reject) => {
// Resolve only once the file has been fully written to disk
writer.on('finish', resolve);
writer.on('error', (err) => reject(err));
response.data.on('error', (err) => reject(err));
});
}
/**
* Scrolls a container on the page a specified number of times.
* @param {Page} page - The Puppeteer page object.
* @param {string} scrollContainerSelector - The CSS selector of the container to scroll.
* @param {number} loops - The number of times to scroll the container.
*/
async function scrollContainer(page, scrollContainerSelector, loops) {
for (let i = 0; i < loops; i++) {
// Scroll the container to the bottom
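// Note: console.log calls inside page.evaluate run in the browser context, not in Node;
// attach page.on('console', msg => console.log(msg.text())) if you want to see them in the terminal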
await page.evaluate((selector) => {
const container = document.querySelector(selector);
if (container) {
// Log the current scroll position and scroll height
console.log(`Before scroll - Scroll Top: ${container.scrollTop}, Scroll Height: ${container.scrollHeight}`);
// Scroll to the bottom
container.scrollTo(0, container.scrollHeight);
// Log the new scroll position after scrolling
console.log(`After scroll - Scroll Top: ${container.scrollTop}, Scroll Height: ${container.scrollHeight}`);
}
}, scrollContainerSelector);
await delay(1000); // Wait for content to load
}
}
/**
* Scrapes images from the page, saves them to the specified directory, and downloads them.
* @param {Page} page - The Puppeteer page object.
* @param {string} scrollContainerSelector - The CSS selector of the container to scroll.
* @param {number} loops - The number of times to scroll the container.
* @param {string} district - The district name, used as the folder name for the saved images.
* @param {string} distId - The ID of the place record, used to prefix the saved file names.
*/
async function scrapeAndSaveImages(page, scrollContainerSelector, loops, district, distId) {
// Scroll the container to load more images
await scrollContainer(page, scrollContainerSelector, loops);
const htmlContent = await page.content();
const $ = cheerio.load(htmlContent);
// Select elements with inline background-image styles
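// Note: '.U39Pmb' is an obfuscated Google Maps class name; it may change without notice and break this selector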
const imageUrls = $('.U39Pmb')
.map((_, element) => {
const style = $(element).attr('style');
if (style) {
// Extract URL from style attribute
const urlMatch = style.match(/url\(["']?(.*?)["']?\)/);
if (urlMatch && urlMatch[1]) {
const url = urlMatch[1];
// Remove everything after '=' in the URL (if needed)
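// Google-hosted photo URLs usually append size directives after '=' (e.g. '=w203-h152-k-no'), so this keeps only the base image URL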
const cleanedUrl = url.split('=')[0];
// Filter out invalid URLs
if (cleanedUrl && !cleanedUrl.startsWith('//:0') && cleanedUrl !== '""') {
return cleanedUrl;
}
}
}
return null;
})
.get() // Convert to an array
.filter((url) => url !== null); // Remove null values
console.log(imageUrls);
// Create a directory to save images
const dir = `./downloaded_images/${district}`;
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir, { recursive: true }); // Create parent directories as needed
}
// Download and save images
for (let i = 0; i < imageUrls.length; i++) {
const url = imageUrls[i];
const filePath = path.join(dir, `${distId}_image_${i + 1}.jpg`);
try {
await downloadImage(url, filePath);
console.log(`Downloaded: ${filePath}`);
} catch (err) {
console.error(`Failed to download ${url}: ${err.message}`);
}
}
}
/**
* Scrapes Google Maps place information for each district and saves the images.
* @param {Array} dists - District records, each with at least an `id` field.
*/
async function getLocalPlacesInfo(dists) {
// Launch a headless browser instance
const browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
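// Present a regular desktop Chrome user agent to reduce the chance of bot detection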
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36');
for (let i = 0; i < dists.length; i++) {
const dist = dists[i];
// Fetch the place data for this district from the blogger info page
const datas = await getDataFromBloggerInfo(dist.id.toLowerCase());
for (let j = 0; j < datas.length; j++) {
const data = datas[j];
try {
const location = data.detailMap['ସ୍ଥାନ']; // 'ସ୍ଥାନ' means "place" in Odia; the key must match the source data exactly
const id = data.id;
const district = data.district;
const locate = location.replace(/ /g, "+").replace(/,/g, "");
const URL = `${requestParams.baseURL}/maps/search/${locate}?hl=${requestParams.hl}`;
// Navigate to the Google Maps URL
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await delay(2000);
// Click on the first result
const selector = '.hfpxzc'; // Class name of the element to click
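// Note: '.hfpxzc' is the obfuscated class Google Maps currently applies to result links; expect it to change over time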
await clickElement(page, selector);
// Open the "All" photos tab
await openAllPhotosTab(page);
// Scroll and scrape images
const scrollContainerSelector = '.m6QErb[style]';
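// Note: '.m6QErb[style]' targets the scrollable photo panel; like the other class names, it is internal to Google Maps and may break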
const loops = 50; // Number of times to scroll
await scrapeAndSaveImages(page, scrollContainerSelector, loops, district, id);
} catch (err) {
console.error(`Failed to process ${data.id}: ${err.message}`);
}
}
}
// Close the browser
await browser.close();
}
/**
* Fetches the tourist-place data for a district from the blogger info page and parses it.
* @param {string} place - The lowercase district id used in the post URL.
* @returns {Promise<Array>} - A promise that resolves to an array of place data objects.
*/
async function getDataFromBloggerInfo(place) {
try {
const response = await axios.get(
`https://calenderapp.blogspot.com/2025/01/touristplaces-${place}.html`
);
const $ = cheerio.load(response.data);
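// The post body is expected to contain a JSON array of place records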
const div = $(".post-body").text();
return JSON.parse(div);
} catch (error) {
console.error("Error fetching decoding parameters:", error);
throw error;
}
}
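/**
* Fetches the list of districts from the blogger info page and parses it.
* @returns {Promise<Array>} - A promise that resolves to an array of district objects.
*/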
async function getDistricts() {
try {
const response = await axios.get(
`https://calenderapp.blogspot.com/2024/12/touristplace-districts.html`
);
const $ = cheerio.load(response.data);
const div = $(".post-body").text();
return JSON.parse(div);
} catch (error) {
console.error("Error fetching decoding parameters:", error);
throw error;
}
}
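/**
* Entry point: fetches the district list and scrapes images for every district's places.
*/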
async function getImages() {
try {
const datas = await getDistricts();
await getLocalPlacesInfo(datas);
} catch (error) {
console.error("Error fetching decoding parameters:", error);
}
}
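// Note: the top-level await below requires an ES module context (e.g. "type": "module" in package.json or a .mjs file)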
// Execute the main entry point
await getImages();