1. 이미지 수집하기

* [픽사베이](https://pixabay.com/ko/)

 

import chromedriver_autoinstaller
import time
from selenium import webdriver
from urllib.request import Request, urlopen
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


# Launch a Chrome browser controlled by selenium and open the pixabay
# search-results page for the keyword "음식" (food).
# NOTE(review): chromedriver_autoinstaller is imported above but .install()
# is never called — presumably a matching chromedriver is already on PATH.
driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/음식/'
driver.get(url)

 

 

# XPath of the container element that holds the search-result images.
image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div'

# XPath of a single example image inside that container.
image_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div/div/div[1]/div[1]/div/a/img'

# Fetch the example image's URL from its src attribute.
image_url = driver.find_element('xpath', image_xpath).get_attribute('src')
print('image_url:', image_url)

# Save the example image to the local machine as pic.jpg.
# (The original text had a stray prose line here — "save the photo under the
# name pic.jpg" — which was not valid Python; it is kept as this comment.)
# A browser-like User-Agent header is sent so the CDN does not reject the
# request.
request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'})
# 'wb' = write binary; the context manager guarantees the file is closed.
with open('pic.jpg', 'wb') as f:
    f.write(urlopen(request).read())

 

 

 

2. 여러개 이미지 수집하기

 

from selenium.webdriver.common.by import By

# Open a fresh browser on the pixabay "음식" (food) search page and give the
# page a few seconds to render before querying the DOM.
driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/음식/'
driver.get(url)
time.sleep(3)


# Locate the result container, then every <img> element inside it.
image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div'
image_area = driver.find_element(By.XPATH, image_area_xpath)
image_elements = image_area.find_elements(By.TAG_NAME, 'img')

# Collect one URL per image.
image_urls = []

for element in image_elements:
    # Lazily-loaded images keep their real URL in data-lazy-src; fall back
    # to src when that attribute is absent (returns None).
    candidate = element.get_attribute('data-lazy-src')
    if candidate is None:
        candidate = element.get_attribute('src')
    print(candidate)
    image_urls.append(candidate)

 

from urllib import parse
import os

# Download every collected URL and save it locally as pic<i><ext>.
for i, image_url in enumerate(image_urls):
    # Derive the real file extension ('.jpg', '.png', ...) from the URL path.
    # The original computed this with os.path.splitext but never used it and
    # hard-coded '.jpg'; fall back to '.jpg' when the URL has no extension.
    _, ext = os.path.splitext(parse.urlparse(image_url).path)
    if not ext:
        ext = '.jpg'

    # Send a browser-like User-Agent so the CDN accepts the request.
    request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'})
    # 'wb' = write binary; the context manager guarantees the file is closed.
    with open(f'pic{i}{ext}', 'wb') as f:
        f.write(urlopen(request).read())

 

 

 

3. 함수로 리팩토링

  • crawl_and_save_image(keyword, pages)
  • os.mkdir('음식') / open('음식/파일이름.jpg', 'wb')

 

def crawl_and_save_image(keyword, pages):
    """Crawl pixabay image-search results for ``keyword`` and save them.

    Visits ``pages`` result pages, collects every image URL found in the
    result container, then downloads each image into a directory named
    after ``keyword`` (created if missing).

    Relies on a module-level selenium ``driver`` already being started.

    :param keyword: search term, also used as the output directory name
    :param pages: number of result pages to crawl (1-based, inclusive)
    """
    image_urls = []
    for page in range(1, pages + 1):
        url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={page}'

        driver.get(url)
        time.sleep(3)  # give the page (and lazy-loaded images) time to render

        image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]/div'
        image_area = driver.find_element(By.XPATH, image_area_xpath)
        image_elements = image_area.find_elements(By.TAG_NAME, 'img')

        for image_element in image_elements:
            # Lazily-loaded images keep the real URL in data-lazy-src;
            # fall back to src when that attribute is absent.
            image_url = image_element.get_attribute('data-lazy-src')
            if image_url is None:
                image_url = image_element.get_attribute('src')
            print(image_url)
            image_urls.append(image_url)

    if not os.path.exists(keyword):
        os.mkdir(keyword)

    for image_url in image_urls:
        # e.g. https://cdn.pixabay.com/photo/2016/.../spaghetti-1932466_1280.jpg
        # -> spaghetti-1932466_1280.jpg
        filename = image_url.split('/')[-1]
        request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'})
        # BUG FIX: the original opened the literal path f'{keyword}/(unknown)',
        # so every download overwrote the same single file. Use the filename
        # derived from the URL instead.
        with open(f'{keyword}/{filename}', 'wb') as f:
            f.write(urlopen(request).read())
# Start a fresh browser session (the function above uses this module-level
# driver) and crawl 2 result pages for the keyword "음식" (food).
driver = webdriver.Chrome()
crawl_and_save_image('음식', 2)

'컴퓨터 비전 > 웹 서비스' 카테고리의 다른 글

FastAPI  (0) 2023.12.16
Streamlit  (2) 2023.12.03
인스타그램  (1) 2023.12.03
셀레니움  (0) 2023.12.02
크롤링(Crawling)  (0) 2023.12.02