Google Maps Image Scraper

import axios from 'axios';
import * as cheerio from 'cheerio';
import puppeteer from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import path from 'path';
import fs from 'fs';
import { pipeline } from 'stream/promises';

// Use the StealthPlugin to avoid detection by websites
puppeteer.use(StealthPlugin());

// Configuration parameters for the request.
// NOTE(review): only `baseURL` and `hl` are read by the code in this file;
// `query` and `coordinates` look like leftovers — confirm before removing.
const requestParams = {
  baseURL: `http://google.com`, // Base URL for Google Maps
  query: "Sarafgarh+Dam,+5QM6%2B2F6,+Odisha+770012", // Query string for the search
  coordinates: "", // GPS coordinates for the search (if needed)
  hl: "en", // Language parameter for the search
};

// Key inside each place's `detailMap` that holds the text used as the Maps search query.
const LOCATION_KEY = 'ସ୍ଥାନ';

/**
 * Utility function to introduce a delay.
 * @param {number} ms - Number of milliseconds to wait.
 * @returns {Promise<void>} A promise that resolves after `ms` milliseconds.
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Clicks an element on the page identified by the given selector.
 * @param {Page} page - The Puppeteer page object.
 * @param {string} selector - The CSS selector of the element to click.
 * @returns {Promise<void>}
 */
async function clickElement(page, selector) {
  // Wait for the element to be visible before clicking it
  await page.waitForSelector(selector, { visible: true });
  await page.click(selector);
}

/**
 * Interacts with the page by clicking the "All" button and waits for the content to load.
 * Despite its name, this function does not capture a screenshot.
 * @param {Page} page - The Puppeteer page object.
 * @returns {Promise<void>}
 */
async function interactWithPageAndTakeScreenshot(page) {
  // Click the button with aria-label="All"
  await clickElement(page, 'button[aria-label="All"]');

  // Wait for any additional loading after clicking the button
  await delay(2000);
}

/**
 * Downloads an image from the given URL and saves it to the specified file path.
 * The promise settles only once the file has been fully written; if the transfer
 * or the write fails, the partially written file is removed.
 * @param {string} url - The URL of the image to download.
 * @param {string} filePath - The path where the image will be saved.
 * @returns {Promise<void>} A promise that resolves when the image is saved.
 * @throws {Error} If the HTTP request, the response stream, or the file write fails.
 */
async function downloadImage(url, filePath) {
  const response = await axios({
    url,
    responseType: 'stream',
  });

  try {
    // pipeline() waits for the writer to finish and propagates errors from
    // either side, unlike listening for 'end' on the readable alone.
    await pipeline(response.data, fs.createWriteStream(filePath));
  } catch (err) {
    // Do not leave a truncated image behind.
    await fs.promises.rm(filePath, { force: true });
    throw err;
  }
}

/**
 * Scrolls a container on the page a specified number of times.
 * @param {Page} page - The Puppeteer page object.
 * @param {string} scrollContainerSelector - The CSS selector of the container to scroll.
 * @param {number} loops - The number of times to scroll the container.
 * @returns {Promise<void>}
 */
async function scrollContainer(page, scrollContainerSelector, loops) {
  for (let i = 0; i < loops; i++) {
    // The callback runs inside the browser page, so its console.log output
    // goes to the page console rather than to this Node process.
    await page.evaluate((selector) => {
      const container = document.querySelector(selector);
      if (container) {
        // Log the current scroll position and scroll height
        console.log(`Before scroll - Scroll Top: ${container.scrollTop}, Scroll Height: ${container.scrollHeight}`);

        // Scroll to the bottom
        container.scrollTo(0, container.scrollHeight);

        // Log the new scroll position after scrolling
        console.log(`After scroll - Scroll Top: ${container.scrollTop}, Scroll Height: ${container.scrollHeight}`);
      }
    }, scrollContainerSelector);

    await delay(1000); // Wait for content to load
  }
}

/**
 * Extracts a downloadable image URL from an inline `style` attribute value.
 * @param {string|undefined} style - The value of an element's `style` attribute.
 * @returns {string|null} The cleaned URL, or null when the style holds no usable URL.
 */
function extractImageUrl(style) {
  if (!style) {
    return null;
  }

  // Extract URL from style attribute
  const urlMatch = style.match(/url\(["']?(.*?)["']?\)/);
  if (!urlMatch || !urlMatch[1]) {
    return null;
  }

  // Remove everything after '=' in the URL
  // (presumably a size/format suffix — TODO confirm).
  const cleanedUrl = urlMatch[1].split('=')[0];

  // Filter out invalid URLs
  if (!cleanedUrl || cleanedUrl.startsWith('//:0') || cleanedUrl === '""') {
    return null;
  }

  // axios in Node cannot request protocol-relative URLs, so give them a scheme.
  return cleanedUrl.startsWith('//') ? `https:${cleanedUrl}` : cleanedUrl;
}

/**
 * Scrapes images from the page, saves them to the specified directory, and downloads them.
 * Individual download failures are logged and do not stop the remaining downloads.
 * @param {Page} page - The Puppeteer page object.
 * @param {string} scrollContainerSelector - The CSS selector of the container to scroll.
 * @param {number} loops - The number of times to scroll the container.
 * @param {string} district - The name of the district; used as the output sub-directory name.
 * @param {string} distId - The ID used as the prefix of each saved file name.
 * @returns {Promise<void>}
 */
async function scrapeAndSaveImages(page, scrollContainerSelector, loops, district, distId) {
  // Scroll the container to load more images
  await scrollContainer(page, scrollContainerSelector, loops);

  const htmlContent = await page.content();
  const $ = cheerio.load(htmlContent);

  // Select elements with inline background-image styles
  const imageUrls = $('.U39Pmb')
    .toArray()
    .map((element) => extractImageUrl($(element).attr('style')))
    .filter((url) => url !== null); // Remove entries without a usable URL

  console.log(imageUrls);

  // Create a directory to save images; `recursive` also creates the
  // parent `downloaded_images` directory on the first run.
  const dir = `./downloaded_images/${district}`;
  fs.mkdirSync(dir, { recursive: true });

  // Download and save images
  for (const [index, url] of imageUrls.entries()) {
    const filePath = path.join(dir, `${distId}_image_${index + 1}.jpg`);
    try {
      await downloadImage(url, filePath);
      console.log(`Downloaded: ${filePath}`);
    } catch (err) {
      console.error(`Failed to download ${url}: ${err.message}`);
    }
  }
}

/**
 * Opens the Google Maps search for one place and saves the images found there.
 * @param {Page} page - The Puppeteer page object.
 * @param {object} place - A place record; reads `place.detailMap`, `place.id` and `place.district`.
 * @returns {Promise<void>}
 * @throws {Error} If the place has no location text or a page interaction fails.
 */
async function scrapePlaceImages(page, place) {
  const location = place.detailMap[LOCATION_KEY];
  if (typeof location !== 'string' || location.trim() === '') {
    throw new Error(`place ${place.id} has no "${LOCATION_KEY}" entry in detailMap`);
  }

  const locate = location.replace(/ /g, "+").replace(/,/g, "");
  const searchUrl = `${requestParams.baseURL}/maps/search/${locate}?hl=${requestParams.hl}`;

  // Navigate to the Google Maps URL
  await page.goto(searchUrl);
  await delay(2000);

  // Click on the first result
  await clickElement(page, '.hfpxzc');

  // Click the "All" button and wait for its content to load
  await interactWithPageAndTakeScreenshot(page);

  // Scroll and scrape images
  const scrollContainerSelector = '.m6QErb[style]';
  const loops = 50; // Number of times to scroll
  await scrapeAndSaveImages(page, scrollContainerSelector, loops, place.district, place.id);
}

/**
 * Main function to scrape local places information and save images.
 * A failure on a single place is logged and the loop continues; a failure to
 * fetch a district's place list aborts the run. The browser is closed in either case.
 * @param {Array<{id: string}>} dists - District records; each `id` selects the places page to fetch.
 * @returns {Promise<void>}
 */
async function getLocalPlacesInfo(dists) {
  // Launch a headless browser instance
  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36');
    page.setDefaultNavigationTimeout(60000);

    for (const district of dists) {
      // Fetch data from the blogger info page
      const places = await getDataFromBloggerInfo(district.id.toLowerCase());

      for (const place of places) {
        try {
          await scrapePlaceImages(page, place);
        } catch (err) {
          console.error(`Failed to scrape images for place ${place?.id}: ${err.message}`);
        }
      }
    }
  } finally {
    // Close the browser even when the run is aborted by an error
    await browser.close();
  }
}

/**
 * Fetches a blog post and parses the text of its `.post-body` element as JSON.
 * @param {string} url - The URL of the blog post.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {Error} If the request fails or the post body is not valid JSON.
 */
async function fetchPostBodyJson(url) {
  const response = await axios.get(url);
  const $ = cheerio.load(response.data);
  return JSON.parse($(".post-body").text());
}

/**
 * Fetches data from the blogger info page and parses it.
 * @param {string} place - The lower-cased district ID used in the page URL.
 * @returns {Promise<Array<object>>} A promise that resolves to an array of data objects.
 * @throws {Error} Rethrows any fetch or parse error after logging it.
 */
async function getDataFromBloggerInfo(place) {
  try {
    return await fetchPostBodyJson(
      `https://calenderapp.blogspot.com/2025/01/touristplaces-${place}.html`
    );
  } catch (error) {
    console.error(`Error fetching tourist places for "${place}":`, error);
    throw error;
  }
}

/**
 * Fetches the list of districts from the districts blog post.
 * @returns {Promise<Array<object>>} A promise that resolves to the parsed district list.
 * @throws {Error} Rethrows any fetch or parse error after logging it.
 */
async function getDistricts() {
  try {
    return await fetchPostBodyJson(
      `https://calenderapp.blogspot.com/2024/12/touristplace-districts.html`
    );
  } catch (error) {
    console.error("Error fetching districts:", error);
    throw error;
  }
}

/**
 * Entry point: loads the district list and scrapes images for every place in it.
 * Errors are logged here and not rethrown.
 * @returns {Promise<void>}
 */
async function getImages() {
  try {
    const districts = await getDistricts();
    await getLocalPlacesInfo(districts);
  } catch (error) {
    console.error("Image scraping failed:", error);
  }
}

await getImages(); // Execute the main function

0 Comments