Web Scraping Google Images and Saving locally with Python

What will be scraped


Full Code

import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch

# Browser-like request headers: without a User-Agent Google may answer with
# different HTML, which leaves the selectors below with nothing to match.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

# URL query parameters of the Google Images search.
params = {
    "q": "mincraft wallpaper 4k", # search query
    "tbm": "isch",                # image results
    "hl": "en",                   # language of the search
    "gl": "us",                   # country where search comes from
    "ijn": "0"                    # page number
}

# Fetch the results page once; the functions below all parse this `soup`.
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

def get_images_with_request_headers():
    """Return the ``src`` value of every ``<img>`` tag in the module-level ``soup``.

    Side effect: removes ``"ijn"`` from the module-level ``params`` and sets
    ``params["content-type"]``.

    NOTE(review): no request is issued here, so the changed ``params`` only
    affect requests made after this call, not the already parsed ``soup``.

    Returns:
        list[str]: ``src`` attribute values in document order.
    """
    # pop() instead of del: a second call must not raise KeyError.
    params.pop("ijn", None)
    params["content-type"] = "image/png" # parameter that indicates the original media type

    # Skip <img> tags without a src attribute instead of raising KeyError.
    return [img["src"] for img in soup.select("img") if img.has_attr("src")]

def get_suggested_search_data():
    """Extract the suggested searches shown above the image results.

    Reads the module-level ``soup``. Thumbnails are taken from the inline
    ``AF_initDataCallback`` script data, the remaining fields from the markup.

    Returns:
        list[dict]: one dict per suggestion with the keys ``name``, ``link``,
        ``chips`` and ``thumbnail``.
    """
    suggested_searches = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # The matched text uses unquoted keys ({key: 'ds:1' ...}), so calling
    # json.loads() on it directly fails with
    # "Expecting property name enclosed in double quotes".
    # After the dumps -> loads round trip the data is still a plain string,
    # which the regular expressions below search.
    matched_images_data_fix = json.dumps(matched_images)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # search for only suggested search thumbnails related
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images_data_json))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    # zip() stops at the shorter input, so a suggestion without a matching
    # thumbnail is left out rather than raising.
    for suggested_search, suggested_search_fixed_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"), suggested_search_thumbnail_encoded):
        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{suggested_search.a['href']}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", suggested_search.a["href"])),
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(suggested_search_fixed_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches

def get_original_images():
    """Extract image results from the page and download every original image.

    Reads the module-level ``soup``. Thumbnails and full-resolution URLs come
    from the inline ``AF_initDataCallback`` script data; title, link and
    source come from the result markup.

    if you try to json.loads() without json.dumps() it will throw an error:
    "Expecting property name enclosed in double quotes"

    Side effects: creates the ``Bs4_Images`` folder if needed, installs a
    global urllib opener carrying a User-Agent header, and saves each image
    as ``Bs4_Images/original_size_img_<index>.jpg``.

    Returns:
        list[dict]: one dict per image with the keys ``title``, ``link``,
        ``source``, ``thumbnail`` and ``original``.
    """
    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # After the dumps -> loads round trip the data is still a plain string,
    # which the regular expressions below search.
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    # Same thumbnail pattern that re.sub() strips further below.
    matched_google_images_thumbnails = ", ".join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(", ")

    # Decoded twice: after the first pass escaped characters are still present.
    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
    ]

    # urlretrieve() does not create missing folders.
    os.makedirs("Bs4_Images", exist_ok=True)

    # Build and install the opener once so every download sends a User-Agent.
    opener = urllib.request.build_opener()
    opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36')]
    urllib.request.install_opener(opener)

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images), start=1):
        title_link = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")

        google_images.append({
            "title": title_link["title"],
            "link": title_link["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f'Downloading {index} image...')

        urllib.request.urlretrieve(original, f'Bs4_Images/original_size_img_{index}.jpg')

    return google_images
Install libraries:

pip install requests bs4 google-search-results
google-search-results is a SerpApi API package that will be shown at the end as an alternative solution.

Basic knowledge scraping with CSS selectors

CSS selectors declare which part of the markup a style applies to thus allowing to extract data from matching tags and attributes.

If you haven't scraped with CSS selectors, there's a dedicated blog post of mine
about how to use CSS selectors when web-scraping that covers what it is, its pros and cons, and why they matter from a web-scraping perspective.

Reduce the chance of being blocked

There's a chance that a request might be blocked. Have a look
at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.

Make sure to pass User-Agent, because Google might block your requests eventually and you'll receive a different HTML thus empty output.

User-Agent identifies the browser, its version number, and its host operating system that represents a person (browser) in a Web context that lets servers and network peers identify if it's a bot or not. And we're faking "real" user visit. Check what is your user-agent.

Code Explanation

Import libraries:

import os, requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup
from serpapi import GoogleSearch
Library Purpose
os to return environment variable (SerpApi API key) value.
requests to make a request to the website.
lxml to process XML/HTML documents fast.
json to convert extracted data to a JSON object.
re to extract parts of the data via regular expression.
urllib.request to save images locally with urllib.request.urlretrieve
BeautifulSoup is an XML/HTML scraping library. It's used in combination with lxml because lxml is faster than html.parser.

Create URL parameter and request headers:

# Browser-like request headers passed along with the search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36"
}

# URL query parameters of the Google Images search.
params = {
    "q": "mincraft wallpaper 4k", # search query
    "tbm": "isch",                # image results
    "hl": "en",                   # language of the search
    "gl": "us",                   # country where search comes from
    "ijn": "0"                    # page number
}
Code Explanation
params a prettier way of passing URL parameters to a request.
user-agent to act as a "real" user request from the browser by passing it to request headers. The default requests user-agent is python-requests, so websites might understand that it's a bot or a script and block the request to the website. Check what's your user-agent.

Make a request, pass created request parameters and headers. Pass returned HTML to BeautifulSoup:

# Request the results page; give up if no response arrives within 30 seconds.
html = requests.get(
    "https://www.google.com/search",
    params=params,
    headers=headers,
    timeout=30,
)
# Parse the returned HTML text with the lxml parser.
soup = BeautifulSoup(html.text, "lxml")
Code Explanation
timeout=30 to stop waiting for response after 30 seconds.
BeautifulSoup(html.text, "lxml") html.text will return a textual HTML data and "lxml" will be set as a XML/HTML processor, not the default html.parser

Extracting data with request headers only, no regular expressions at the moment:

def get_images_with_request_headers():
    """Return the ``src`` value of every ``<img>`` tag in the module-level ``soup``.

    Side effect: sets ``params["content-type"]`` on the module-level ``params``.

    NOTE(review): no request is issued here, so the changed ``params`` only
    affect requests made after this call, not the already parsed ``soup``.

    Returns:
        list[str]: ``src`` attribute values in document order.
    """
    params["content-type"] = "image/png" # parameter that indicates the original media type

    # Skip <img> tags without a src attribute instead of raising KeyError.
    return [img["src"] for img in soup.select("img") if img.has_attr("src")]
The reason why it's handy is because when you try to directly parse data from the img tag and its src attribute, you'll get a base64 encoded URL which will be a 1x1 image placeholder. Not a particularly useful image resolution 🙂

Code Explanation
params["content-type"] will create a new dict key "content-type" and assign it an "image/png" value which will return images.
[img["src"] for img in soup.select("img")] will iterate over all img tags and extract the src attribute in a list comprehension; the returned value is a list of URLs from the src attribute.

Print returned data:

Now to the suggested search results, a thing above actual images:

def get_suggested_search_data():
    """Extract the suggested searches shown above the image results.

    Reads the module-level ``soup``. Thumbnails are taken from the inline
    ``AF_initDataCallback`` script data, the remaining fields from the markup.

    if you try to json.loads() without json.dumps it will throw an error:
    "Expecting property name enclosed in double quotes"

    Returns:
        list[dict]: one dict per suggestion with the keys ``name``, ``link``,
        ``chips`` and ``thumbnail``.
    """
    suggested_searches = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/6
    matched_images = "".join(re.findall(r"AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>", str(all_script_tags)))

    # After the dumps -> loads round trip the data is still a plain string,
    # which the regular expressions below search.
    matched_images_data_fix = json.dumps(matched_images)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # search for only suggested search thumbnails related
    # https://regex101.com/r/ITluak/2
    suggested_search_thumbnails = ",".join(re.findall(r'{key(.*?)\[null,\"Size\"', matched_images_data_json))

    # https://regex101.com/r/MyNLUk/1
    suggested_search_thumbnail_encoded = re.findall(r'\"(https:\/\/encrypted.*?)\"', suggested_search_thumbnails)

    # zip() is used on purpose over zip_longest() as number of results would be identical
    for suggested_search, suggested_search_fixed_thumbnail in zip(soup.select(".PKhmud.sc-it.tzVsfd"), suggested_search_thumbnail_encoded):
        suggested_searches.append({
            "name": suggested_search.select_one(".VlHyHc").text,
            "link": f"https://www.google.com{suggested_search.a['href']}",
            # https://regex101.com/r/y51ZoC/1
            "chips": "".join(re.findall(r"&chips=(.*?)&", suggested_search.a["href"])),
            # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
            "thumbnail": bytes(suggested_search_fixed_thumbnail, "ascii").decode("unicode-escape")
        })

    return suggested_searches
Code Explanation
suggested_searches a temporary list where extracted data will be appended at the end of the function.
all_script_tags a variable which will hold all extracted <script> HTML tags from soup.select("script") where select() will return a list of matched <script> tags.
matched_images will hold all extracted matched images data from re.findall(), which returns a list of matches that is then joined into a single string. This variable is needed to extract suggested search thumbnails, image thumbnails and full-resolution images.
suggested_search_thumbnails and suggested_search_thumbnail_encoded parses part of inline JSON where suggested_search_thumbnail_encoded parses actual thumbnails from partly parsed inline JSON data.
zip() to iterate over multiple iterables in parallel. Keep in mind that zip is used on purpose. zip() ends with the shortest iterator while zip_longest() iterates up to the length of the longest iterator.
suggested_searches.append({}) to append extracted images data to a list as a dictionary.
select_one() to return one (instead of all) matched element in a loop.
["href"] is a shortcut of accessing and extracting HTML attributes with BeautifulSoup. Alternative is get(<attribute>).
"".join() to join all items from an iterable into a string.
bytes(<variable>, "ascii").decode("unicode-escape") to decode parsed image data.

Printed returned data:

[
  {
    "name": "ultra hd",
    "link": "https://www.google.com/search?q=minecraft+wallpaper+4k&tbm=isch&hl=en&gl=us&chips=q:minecraft+wallpaper+4k,g_1:ultra+hd:5VuluDYWa8Y%3D&sa=X&ved=2ahUKEwjshdCK0Yn5AhXrlWoFHYhyCrQQ4lYoAHoECAEQHQ",
    "chips": "q:minecraft+wallpaper+4k,g_1:ultra+hd:5VuluDYWa8Y%3D",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcThU0xo_GeIciyaBmvE6EI46tnj0npeDAmDsLKjYlnv4tGz0eaw&usqp=CAU"
  },
  {
    "name": "epic",
    "link": "https://www.google.com/search?q=minecraft+wallpaper+4k&tbm=isch&hl=en&gl=us&chips=q:minecraft+wallpaper+4k,g_1:epic:5c56RYLjq2c%3D&sa=X&ved=2ahUKEwjshdCK0Yn5AhXrlWoFHYhyCrQQ4lYoAXoECAEQHw",
    "chips": "q:minecraft+wallpaper+4k,g_1:epic:5c56RYLjq2c%3D",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ_bUq-7tk9FyeNSW40Yo8FRY6SOViMbUeme_ln1uMwxcTdfI6d&usqp=CAU"
  }, ... other results
]
Extracting original resolution images:

def get_original_images():
    """Extract image results from the page and download every original image.

    Reads the module-level ``soup``. Thumbnails and full-resolution URLs come
    from the inline ``AF_initDataCallback`` script data; title, link and
    source come from the result markup.

    if you try to json.loads() without json.dumps() it will throw an error:
    "Expecting property name enclosed in double quotes"

    Side effects: creates the ``Bs4_Images`` folder if needed, installs a
    global urllib opener carrying a User-Agent header, and saves each image
    as ``Bs4_Images/original_size_img_<index>.jpg``.

    Returns:
        list[dict]: one dict per image with the keys ``title``, ``link``,
        ``source``, ``thumbnail`` and ``original``.
    """
    google_images = []

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # After the dumps -> loads round trip the data is still a plain string,
    # which the regular expressions below search.
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    # Same thumbnail pattern that re.sub() strips further below.
    matched_google_images_thumbnails = ", ".join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(", ")

    # Decoded twice: after the first pass escaped characters are still present.
    thumbnails = [
        bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
    ]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
        bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
    ]

    # urlretrieve() does not create missing folders.
    os.makedirs("Bs4_Images", exist_ok=True)

    # Build and install the opener once so every download sends a User-Agent.
    opener = urllib.request.build_opener()
    opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
    urllib.request.install_opener(opener)

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select(".isv-r.PNCib.MSM1fd.BUooTd"), thumbnails, full_res_images), start=1):
        title_link = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")

        google_images.append({
            "title": title_link["title"],
            "link": title_link["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

        # Download original images
        print(f"Downloading {index} image...")

        urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")

    return google_images
The process is almost identical to extracting suggested search results except for different regular expressions:

1. Create a temporary list google_images where extracted data will be appended.

2. Extracting all_script_tags.

3. Extracting matched_images_data to extract thumbnails and original resolution images.

4. Decode extracted encoded thumbnails:

# Decode every matched thumbnail URL twice in one list comprehension.
thumbnails = [
    bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails
]

# equivalent to
for fixed_google_image_thumbnail in matched_google_images_thumbnails:
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
    # after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
    google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
5. Decode extracted encoded full_res_images:

# Decode every matched full-resolution URL twice in one list comprehension.
full_res_images = [
    bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
]

# equivalent to
for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
    # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Save full resolution images locally:

# Build an opener, attach the User-Agent header, and install it globally so
# that urlretrieve() sends the header with each download.
opener = urllib.request.build_opener()
opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
urllib.request.install_opener(opener)

urllib.request.urlretrieve(original, f"Bs4_Images/original_size_img_{index}.jpg")
Code Explanation
urllib.request.build_opener() manages the chaining of handlers and will automatically add headers on each request (row below).
opener.addheaders = [(...)] to set the headers that are sent with each request.
urllib.request.install_opener() sets opener as the default global opener, meaning later urllib.request calls such as urlretrieve() go through it 👀
urllib.request.urlretrieve() to save images locally.

Printed returned data:

[
  {
    "title": "4K Minecraft Wallpapers | Background Images",
    "link": "https://wall.alphacoders.com/tag/4k-minecraft-wallpapers",
    "source": "wall.alphacoders.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSJxrGh1FUsvCRNgKI4aiM8CimALQ0rHU2SDigSRl6X1c7BiWDOUMMMVCwyKtufB9SEddw&usqp=CAU",
    "original": "https://images6.alphacoders.com/108/thumb-1920-1082090.jpg"
  },
  {
    "title": "Best Minecraft Wallpaper 4k - Minecraft Tutos",
    "link": "https://minecraft-tutos.com/en/minecraft-wallpaper/",
    "source": "minecraft-tutos.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRDMguXava6khO5e5A0GQsm5v64rrJI_tYuSaJjyxWQNhTrhRWPRLLuhtPVouOUSaqzC0&usqp=CAU",
    "original": "https://minecraft-tutos.com/wp-content/uploads/2022/03/wallpaper-minecraft-alex-steve-universe.jpeg"
  }, ... other results
]
Using Google Images API

The main difference is that it's a quicker approach. No need to figure out regular expressions, create a parser and maintain it over time, or how to scale the number of requests without being blocked.

Example with pagination and multiple search queries:

def serpapi_get_google_images():
    """Collect original image URLs for several queries via SerpApi and download them.

    For each query, pages are requested one after another until the response
    contains an ``"error"`` key. Every new ``original`` URL is collected once.

    Side effects: creates the ``SerpApi_Images`` folder if needed, installs a
    global urllib opener carrying a User-Agent header, saves each collected
    image as ``SerpApi_Images/original_size_img_<index>.jpg`` and prints the
    collected URLs as JSON. Reads the API key from the ``API_KEY`` environment
    variable.
    """
    image_results = []
    seen_originals = set()  # O(1) duplicate check alongside the ordered list

    for query in ["Coffee", "boat", "skyrim", "minecraft"]:
        # search query parameters
        params = {
            "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
            "q": query,                       # search query
            "tbm": "isch",                    # image results
            "num": "100",                     # number of images per page
            "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
            "api_key": os.getenv("API_KEY")   # your serpapi api key
            # other query parameters: hl (lang), gl (country), etc
        }

        search = GoogleSearch(params)         # where data extraction happens

        images_is_present = True
        while images_is_present:
            results = search.get_dict()       # JSON -> Python dictionary

            # checks for "Google hasn't returned any results for this query."
            if "error" not in results:
                for image in results["images_results"]:
                    if image["original"] not in seen_originals:
                        seen_originals.add(image["original"])
                        image_results.append(image["original"])

                # update to the next page
                # NOTE(review): relies on GoogleSearch keeping a reference to
                # this dict so the next get_dict() sees the new page - confirm.
                params["ijn"] += 1
            else:
                # An error response ends pagination for this query.
                images_is_present = False

    # -----------------------
    # Downloading images

    # urlretrieve() does not create missing folders.
    os.makedirs("SerpApi_Images", exist_ok=True)

    # Build and install the opener once so every download sends a User-Agent.
    opener = urllib.request.build_opener()
    opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36")]
    urllib.request.install_opener(opener)

    # Iterate the collected URLs: `results` at this point is the last
    # response, which is the error response that ended the final query.
    for index, original in enumerate(image_results, start=1):
        print(f"Downloading {index} image...")

        urllib.request.urlretrieve(original, f"SerpApi_Images/original_size_img_{index}.jpg")

    print(json.dumps(image_results, indent=2))
2349 # number of total extracted images
