구글 이미지 검색 후 크롤링

goodthings4me.tistory.com

'용산 차막힘' 관련 이미지를 검색하다가 구글 이미지 검색 후 다운로드하는 파이썬 크롤링 코드를 만들어 보기로 했다. 2021.10월에 이미지 구글링 관련 포스팅 글(구글에서 원하는 이미지 다운로드)에 이은 두번째 크롤링 코드다.

[파이썬 크롤링] 구글 이미지 다운로드

이전 포스팅과 차이점은

크롬 브라우저 버전에 맞는 selenium 크롬 웹 드라이버가 자동으로 설치되도록 webdriver_manager 모듈을 적용하였고,
selenium 업그레이드로 html 요소 추출하는 코드를 버전에 맞게 적용하였다.
추출할 이미지 숫자를 지정하면 그 숫자까지만 추출한다.

1. 페이지 스크롤링 안 한 상태에서 이미지 추출하는 파이썬 코드

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os, time, random
from bs4 import BeautifulSoup
import urllib.request


def chromeWebdriver():
    """Create the Chrome WebDriver used by the crawler.

    The chromedriver binary is obtained via ``ChromeDriverManager().install()``
    and handed to a selenium ``Service``; the browser is started with a Korean
    locale, a fixed desktop user-agent, and the ``detach`` option enabled.

    Returns:
        The ``webdriver.Chrome`` instance created with these options.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    )
    # Command-line switches, applied in this order.
    switches = [
        "lang=ko_KR",  # language setting
        "disable-infobars",
        "--disable-extensions",
        f'user-agent={user_agent}',
    ]
    experimental = {
        'detach': True,  # keep the browser from closing
        'excludeSwitches': ['enable-logging'],  # hide system device error logs
    }

    options = Options()
    for switch in switches:
        options.add_argument(switch)
    for key, value in experimental.items():
        options.add_experimental_option(key, value)
    # Optional switches; uncomment to enable.
    # options.add_argument("start-maximized")  # maximize the window
    # options.add_argument('--headless')  # headless chrome: do not show the browser window

    chromedriver = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver), options=options)


def collect_image(search_word, save_root="c:\\temp\\"):
    """Search Google Images for *search_word* and download the visible results.

    Only the thumbnails present before any scrolling are processed. Each
    thumbnail is clicked so that the preview panel exposes the original image
    URL; URLs that do not start with ``http`` (e.g. inline ``data:`` images)
    are skipped. The collected URLs are then downloaded into a new folder
    ``<save_root>/<YYYYmmdd_HHMMSS>_<search_word>`` as ``001.ext``, ``002.ext``...

    Args:
        search_word: Query typed into the Google search box; also used as the
            suffix of the output folder name.
        save_root: Directory under which the output folder is created.
            Defaults to ``c:\\temp\\``; it is created when missing.

    Returns:
        None. Progress is reported with ``print``.
    """
    from urllib.parse import urlparse

    from selenium.common.exceptions import WebDriverException

    url = 'https://www.google.co.kr'
    thumbnail_selector = '#islrg > div.islrc > div a.wXeWr.islib.nfEiy'
    preview_selector = (
        '#Sva75c > div > div > div.pxAole > div.tvh9oe.BIB1wf > c-wiz > div '
        '> div.OUZ5W > div.zjoqD > div.qdnLaf.isv-id > div > a'
    )
    known_extensions = ('jpg', 'jpeg', 'png', 'gif')

    now = time.localtime()
    # Zero-padded fields keep the folder name unambiguous (e.g. 1/11 vs 11/1).
    today_time = time.strftime('%Y%m%d_%H%M%S', now)
    print(today_time)

    # Build the folder from full paths instead of os.chdir(), so the process
    # working directory is left alone and a missing save_root is not fatal.
    file_save_dir = os.path.join(save_root, today_time + '_' + search_word)
    os.makedirs(file_save_dir, exist_ok=True)
    print(file_save_dir)

    img_src = []
    driver = chromeWebdriver()
    try:
        driver.get(url)
        time.sleep(random.uniform(2, 3))
        elem_q = driver.find_element(By.NAME, 'q')
        elem_q.send_keys(search_word)
        elem_q.submit()

        driver.find_element(By.LINK_TEXT, '이미지').click()  # click the '이미지' (Images) text menu link
        time.sleep(random.uniform(1, 2))

        imgs = driver.find_elements(By.CSS_SELECTOR, thumbnail_selector)
        print(len(imgs))

        for img in imgs:
            try:
                img.click()  # clicking reveals the original image URL in the preview panel
                preview_link = driver.find_element(By.CSS_SELECTOR, preview_selector)
                time.sleep(random.uniform(0.2, 0.5))  # short pause before reading the <img> src
                src = preview_link.find_element(By.TAG_NAME, 'img').get_attribute('src')
            except WebDriverException:
                # One thumbnail without a usable preview must not abort the run.
                continue
            if not src or src[:4] != 'http':
                continue
            img_src.append(src)
            print(len(img_src), src, '\n')
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    file_no = 0  # number of images saved so far
    for src in img_src:
        # Take the extension from the URL path only, ignoring any query string.
        suffix = os.path.splitext(urlparse(src).path)[1]
        ext = suffix if suffix[1:].lower() in known_extensions else '.jpg'
        target = os.path.join(file_save_dir, str(file_no + 1).zfill(3) + ext)
        try:
            urllib.request.urlretrieve(src, target)
        except Exception as err:
            # Best-effort download: report the failure and move on.
            print(f'이미지 저장 실패: {src} ({err})')
            continue
        file_no += 1
        print(src)
        print(f'{file_no}번째 이미지 저장-----')


if __name__ == "__main__":
    # Script entry point: run a sample crawl for the keyword '고양이' (cat).
    collect_image(search_word='고양이')

C:\ 드라이브 Temp 폴더에 이미지 저장 폴더를 생성한다.
time.localtime() 함수의 반환값 time.struct_time(tm_year=2022, tm_mon=5, tm_mday=20, tm_hour=9, tm_min=37, tm_sec=51, tm_wday=4, tm_yday=140, tm_isdst=0)을 이용해서 폴더 이름을 만든다.
구글에서 "고양이"로 검색한 후 selenium 크롬 웹 드라이버로 검색된 이미지 전체 요소를 추출한다.
구글 이미지는 클릭 시 우측에 원본 이미지가 다시 표시되는데, 이때 클릭된 이미지 요소를 개발자 도구(F12)에서 확인해보면 원본 이미지의 html 코드 하단에 display 된다.
이미지 링크가 있는 <img> 태그를 직접 지정하면 어떤 원인인지 몰라도 에러가 발생하여 바로 위 <a> 태그를 기반으로 <img>에 접근하여 속성 src를 가져왔다. (이때, 인터벌이 필요하여 time 적용 후 실행함)
구글 이미지 특성상 src가 data:image/jpeg;base64,/9j/4AAQS ~ 형태로 추출되는 경우는 작은 이미지일 가능성이 있어서, 문자열 슬라이싱으로 앞 4글자를 비교하여 http로 시작되는 url만 추출한다.
이미지 확장자는 'jpg', 'JPG', 'jpeg', 'JPEG', 'png', 'PNG', 'gif', 'GIF'인 경우는 그대로 붙이고 나머지는 'jpg'로 붙임
추출된 이미지 URL 리스트를 대상으로 urllib.request.urlretrieve()를 활용하여 이미지를 저장한다.
크롬 브라우저를 띄우지 않고 작업하려면 주석 처리된 # options.add_argument('--headless')의 주석을 푼다.

[실행 결과]

2. 페이지 스크롤링 후 이미지 추출하는 파이썬 코드

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import os, time, random
from bs4 import BeautifulSoup
import urllib.request


def chromeWebdriver():
    """Create the Chrome WebDriver used by the scrolling crawler.

    The chromedriver path returned by ``ChromeDriverManager().install()`` is
    passed to a selenium ``Service`` as ``executable_path``; the browser is
    started with a Korean locale, a fixed desktop user-agent, and the
    ``detach`` option enabled.

    Returns:
        The ``webdriver.Chrome`` instance created with these options.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    )
    # Command-line switches, applied in this order.
    switches = (
        "lang=ko_KR",  # language setting
        "disable-infobars",
        "--disable-extensions",
        f'user-agent={user_agent}',
    )
    experimental = (
        ('detach', True),  # keep the browser from closing
        ('excludeSwitches', ['enable-logging']),  # hide system device error logs
    )

    options = Options()
    for switch in switches:
        options.add_argument(switch)
    for name, value in experimental:
        options.add_experimental_option(name, value)
    # Optional switches; uncomment to enable.
    # options.add_argument("start-maximized")  # maximize the window
    # options.add_argument('--headless')  # headless chrome: do not show the browser window

    service = Service(executable_path=ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


def collect_image(search_word, extract_img_count, save_root="c:\\temp\\"):
    """Search Google Images, scroll to load more results, and download images.

    The result page is scrolled with PAGE_DOWN (clicking the "more results"
    button when it is clickable) until the page height stops growing. Each
    thumbnail is then clicked so the preview panel exposes the original image
    URL; URLs that do not start with ``http`` are skipped. Up to
    ``extract_img_count + 10`` URLs are collected (a margin for failed
    downloads) and at most ``extract_img_count`` images are saved into
    ``<save_root>/<YYYYmmdd_HHMMSS>_<search_word>`` as ``001.ext``, ``002.ext``...

    Args:
        search_word: Query typed into the Google search box; also used as the
            suffix of the output folder name.
        extract_img_count: Maximum number of images to save.
        save_root: Directory under which the output folder is created.
            Defaults to ``c:\\temp\\``; it is created when missing.

    Returns:
        None. Progress is reported with ``print``.
    """
    from urllib.parse import urlparse

    from selenium.common.exceptions import WebDriverException

    url = 'https://www.google.co.kr'
    thumbnail_selector = '#islrg > div.islrc > div a.wXeWr.islib.nfEiy'
    preview_selector = (
        '#Sva75c > div > div > div.pxAole > div.tvh9oe.BIB1wf > c-wiz > div '
        '> div.OUZ5W > div.zjoqD > div.qdnLaf.isv-id > div > a'
    )
    more_results_xpath = '//*[@id="islmp"]/div/div/div/div[1]/div[2]/div[2]/input'
    known_extensions = ('jpg', 'jpeg', 'png', 'gif')
    stall_limit = 20  # stop scrolling once the height counter reaches this value

    now = time.localtime()
    # Zero-padded fields including seconds: unambiguous, and two runs within
    # the same minute no longer collide on the folder name.
    today_time = time.strftime('%Y%m%d_%H%M%S', now)
    print(today_time)

    # Build the folder from full paths instead of os.chdir(), so the process
    # working directory is left alone and a missing save_root is not fatal.
    file_save_dir = os.path.join(save_root, today_time + '_' + search_word)
    os.makedirs(file_save_dir, exist_ok=True)
    print(file_save_dir)

    # Scroll the page down
    def page_scrolling(drivers):
        """Scroll *drivers*' current page until its height stops changing."""
        elem = drivers.find_element(By.TAG_NAME, 'body')
        page_height = drivers.execute_script('return document.body.scrollHeight')

        scroll_cnt = 1
        equal_cnt = 1  # reset whenever the page height changes
        while True:
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(random.uniform(0.3, 0.5))
            new_height = drivers.execute_script('return document.body.scrollHeight')
            if page_height != new_height:
                page_height = new_height
                equal_cnt = 1
            print(f'scroll_cnt: {scroll_cnt}, new_height: {new_height}, equal_cnt: {equal_cnt}')

            scroll_cnt += 1
            equal_cnt += 1
            try:
                drivers.find_element(By.XPATH, more_results_xpath).click()  # handle the "more results" button
                print('결과 더보기 버튼 클릭 처리')
            except WebDriverException:
                # Button missing or not clickable yet: keep scrolling.
                pass

            # Checked on every pass so the loop cannot run forever.
            if equal_cnt >= stall_limit:
                break

    img_src = []
    url_limit = extract_img_count + 10  # collect extra URLs in case some downloads fail
    driver = chromeWebdriver()
    try:
        driver.get(url)
        time.sleep(random.uniform(2, 3))
        elem_q = driver.find_element(By.NAME, 'q')
        elem_q.send_keys(search_word)
        elem_q.submit()

        driver.find_element(By.LINK_TEXT, '이미지').click()  # click the '이미지' (Images) text menu link
        time.sleep(random.uniform(1, 2))

        page_scrolling(driver)

        imgs = driver.find_elements(By.CSS_SELECTOR, thumbnail_selector)
        print(len(imgs))

        for img in imgs:
            try:
                img.click()  # clicking reveals the original image URL in the preview panel
                preview_link = driver.find_element(By.CSS_SELECTOR, preview_selector)
                time.sleep(random.uniform(0.2, 0.5))  # short pause before reading the <img> src
                src = preview_link.find_element(By.TAG_NAME, 'img').get_attribute('src')
            except WebDriverException:
                # One thumbnail without a usable preview must not abort the run.
                continue
            if not src or src[:4] != 'http':
                continue
            img_src.append(src)
            print(len(img_src), src, '\n')
            if len(img_src) >= url_limit:
                break
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    print(f'\n{"="*10} 추출한 전체 리스트 {"="*10}\n{img_src}\n\n{"="*10}총 {len(img_src)}개 추출함{"="*10}\n')

    file_no = 0  # number of images saved so far
    for src in img_src:
        # Checked before downloading so extract_img_count == 0 saves nothing.
        if file_no >= extract_img_count:
            break
        # Take the extension from the URL path only, ignoring any query string.
        suffix = os.path.splitext(urlparse(src).path)[1]
        ext = suffix if suffix[1:].lower() in known_extensions else '.jpg'
        target = os.path.join(file_save_dir, str(file_no + 1).zfill(3) + ext)
        try:
            urllib.request.urlretrieve(src, target)
        except Exception as err:
            # Best-effort download: report the failure and move on.
            print(f'이미지 저장 실패: {src} ({err})')
            continue
        file_no += 1
        print(src)
        print(f'{file_no}번째 이미지 저장-----')


if __name__ == "__main__":
    # Script entry point: crawl up to 200 images for the keyword '고양이' (cat).
    collect_image(search_word='고양이', extract_img_count=200)

구글 이미지 검색은 스크롤링을 하면 더 많은 이미지를 볼 수 있는데, 어느정도의 스크롤링 후 '결과 더보기' 버튼이 나온다. 이 버튼을 클릭한 후 다시 스크롤링을 하는데, '결과 더보기' 버튼이 2번은 안 나오는 것 같다.
구글 검색 결과 페이지의 상단 메뉴에서 '이미지' 링크의 순서가 바뀌는 경우(전체 | 이미지 | 동영상 ~ 순서가 전체 | 동영상 | 이미지 등으로 바뀌는 경우)가 있어, 순서에 의존하는 XPATH 대신 LINK_TEXT로 '이미지' 링크를 찾도록 하였다. 구글 이미지 검색 URL을 직접 이용하면 이 문제 자체를 피할 수 있다.

저작자표시 비영리 변경금지

'코딩 연습 > 파이썬 크롤링' 카테고리의 다른 글

다음 뉴스 기사 제목 본문 크롤링 후 텍스트 저장 (0)	2022.06.01
온채널 도매 사이트 명예의전당 우수상품 리스트 (0)	2022.05.24
네이버 인플루언서 탭에서 인플루언서의 이름과 팬 숫자 추출하기 (0)	2022.05.18
네이버 뉴스 크롤링 - 기사 제목과 링크(URL) 추출 (0)	2022.05.16
티스토리 블로그 내 이미지 다운로드 (0)	2022.05.10

구글 이미지 검색 후 크롤링

[파이썬 크롤링] 구글 이미지 다운로드

1. 페이지 스크롤링 안 한 상태에서 이미지 추출하는 파이썬 코드

[실행 결과]

2. 페이지 스크롤링 후 이미지 추출하는 파이썬 코드

'코딩 연습 > 파이썬 크롤링' 카테고리의 다른 글

댓글

티스토리툴바

구글 이미지 검색 후 크롤링

[파이썬 크롤링] 구글 이미지 다운로드

1. 페이지 스크롤링 안 한 상태에서 이미지 추출하는 파이썬 코드

[실행 결과]

2. 페이지 스크롤링 후 이미지 추출하는 파이썬 코드

'코딩 연습 > 파이썬 크롤링' 카테고리의 다른 글

관련글

댓글

티스토리툴바