import asyncio
import json
from datetime import datetime
from urllib.parse import unquote, urlsplit

from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright
# Search phrase typed into the Google Images search box.
QUERY = "sunset mountains"
# Target number of image URLs to collect and write to the output file.
RESULTS = 100
# Seconds to sleep after each scroll step before looking for new thumbnails.
SCROLL_PAUSE = 1.2
# Proxy URL used when launching Chromium; a falsy value launches without a
# proxy. The username/password/HOST/PORT parts are placeholders to replace.
PROXY = "http://username:password@HOST:PORT" # Ping proxy (add geo/session params if needed)
def _proxy_settings(proxy_url):
    """Translate a proxy URL into Playwright's ``proxy`` launch option.

    Chromium does not authenticate with credentials embedded in the proxy
    server URL, so ``user:pass@`` is split out into the separate
    ``username`` / ``password`` fields that Playwright expects.

    Args:
        proxy_url: Proxy URL such as ``http://user:pass@host:port``, or a
            falsy value to disable the proxy.

    Returns:
        A dict suitable for ``launch(proxy=...)``, or ``None`` when no proxy
        is configured. URLs without embedded credentials are passed through
        unchanged as ``{"server": proxy_url}``.
    """
    if not proxy_url:
        return None
    parts = urlsplit(proxy_url)
    if not parts.username:
        return {"server": proxy_url}
    # Use the raw netloc rather than hostname/port: those attributes
    # lower-case the host and raise on a non-numeric port.
    host = parts.netloc.rpartition("@")[2]
    settings = {
        "server": f"{parts.scheme}://{host}",
        "username": unquote(parts.username),
    }
    if parts.password is not None:
        settings["password"] = unquote(parts.password)
    return settings


async def _collect_image_urls(page, limit, pause, max_stalled=3):
    """Scroll the results page and gather thumbnail URLs in page order.

    Args:
        page: Playwright page already showing the image results.
        limit: Maximum number of distinct URLs to return.
        pause: Seconds to sleep after each scroll step.
        max_stalled: Number of consecutive scrolls that neither grow the
            page nor reveal new URLs before giving up.

    Returns:
        A list of at most ``limit`` distinct ``src`` / ``data-src`` values,
        in the order they were first seen.
    """
    # A dict is used as an insertion-ordered set so that truncating to
    # ``limit`` keeps the first results instead of an arbitrary subset.
    found = {}
    last_height = 0
    stalled = 0
    while len(found) < limit:
        seen_before = len(found)
        thumbs = await page.locator("img[jsname='Q4LuWd']").element_handles()
        for thumb in thumbs:
            for attribute in ("src", "data-src"):
                value = await thumb.get_attribute(attribute)
                if value:
                    found.setdefault(value, None)
            if len(found) >= limit:
                break
        if len(found) >= limit:
            break

        await page.mouse.wheel(0, 3000)
        await asyncio.sleep(pause)
        try:
            await page.click("text=Show more results", timeout=1000)
        except PlaywrightError:
            # The button is usually absent; clicking it is best-effort.
            pass

        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == last_height and len(found) == seen_before:
            # Nothing new loaded. Stop after a few tries so a query with
            # fewer than ``limit`` results cannot loop forever.
            stalled += 1
            if stalled >= max_stalled:
                break
        else:
            stalled = 0
        last_height = new_height
    return list(found)[:limit]


async def run():
    """Scrape Google Images thumbnail URLs for ``QUERY`` into a JSON file.

    Launches headless Chromium (through ``PROXY`` when set), submits the
    query, scrolls until ``RESULTS`` URLs are collected or the page stops
    yielding new ones, and writes the records to ``google_images_urls.json``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            proxy=_proxy_settings(PROXY),
        )
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                viewport={"width": 1366, "height": 900},
            )
            page = await context.new_page()
            await page.goto("https://www.google.com/imghp", wait_until="domcontentloaded")
            # Handle consent dialog (only shown in some regions/sessions).
            try:
                await page.click("button:has-text('I agree')", timeout=2000)
            except PlaywrightError:
                pass
            await page.fill("input[aria-label='Search for images']", QUERY)
            await page.keyboard.press("Enter")
            await page.wait_for_selector("div#islmp")
            urls = await _collect_image_urls(page, RESULTS, SCROLL_PAUSE)
        finally:
            # Close the browser even when navigation or scraping fails.
            await browser.close()

    # One timestamp for the whole batch.
    # NOTE: utcnow() yields a naive UTC time and is deprecated since 3.12;
    # it is kept so the "scraped_at" format stays unchanged.
    scraped_at = datetime.utcnow().isoformat()
    data = [{"query": QUERY, "img": url, "scraped_at": scraped_at} for url in urls]
    with open("google_images_urls.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(run())