Published Date : 2019年10月22日1:08
架空のお仕事をしてみる企画(3-5)です。
This is part 3-5 of a project in which I take on a fictitious job.
仮に自分がフリーランスで、 ある依頼者から適当な仕事を振られてみたら、 果たしてクリアできるのか?といった企画。
If I were a freelance programmer and a client handed me some arbitrary job, could I actually complete it? That is the idea behind this project.
この企画は架空のものですが、日本のクラウドソーシング市場に氾濫しているよくある案件と値段と工数を参考にしてます。
This project is a fictitious one, but it is based on common cases, prices and man-hours flooding the Japanese crowdsourcing market.
Collector4,5,6を作成。それにまた新しい機能を追加し、多少の変更を加えました。
Created Collector4, 5 and 6. I've also added new features and made some changes.
ファイル構成は以下の通り
The file structure is as follows
prototype002 collectors firstCollector.py secondCollector.py thirdCollector.py fourthCollector.py fifthCollector.py sixthCollector.py data first_collection settings.csv second_collection settings.csv third_collection settings.csv fourth_collection settings.csv fifth_collection settings.csv sixth_collection settings.csv scripts collectors_utils.py chromedriver.exe crawler.py
全体コードとほんのちょっと解説
General code and brief description.
# wonderful spam!
import csv
import os
import re
import sys
import time
from datetime import datetime

from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

sys.path.append('./scripts')
import collectors_utils

# if you want to use choromedriver_binary
# import chromedriver_binary
# chromedriver_binary.add_chromedriver_to_path()


class FourthCollector():
    """Scrape one-way fare tables, one result page per day, for each settings row."""

    def __init__(self, url, data_dir):
        """Remember the site URL and the directory holding settings and output."""
        self.base_url = url
        self.data_dir = data_dir

    def close_modal(self, driver):
        """Click the modal's close button if it is there; otherwise do nothing."""
        try:
            driver.find_element(By.XPATH, '//div[@id="cv-tech-modal-close"]').click()
        except WebDriverException:
            # Best effort: a missing or unclickable modal is not an error.
            pass

    @staticmethod
    def _is_end_date(current, end_date):
        """Return True when the (y, m, d) strings in *current* equal *end_date*.

        Fields are compared as integers so that '2019/12/1' matches
        '2019/12/01'; non-numeric text falls back to exact string equality.
        """
        try:
            return [int(p) for p in current] == [int(p) for p in end_date.split('/')]
        except ValueError:
            return '/'.join(current) == end_date

    def page_feed(self, driver, end_date, current_date, y, m, d):
        """Advance to the next day unless the end date has been reached.

        Returns True when y/m/d is the end date (nothing is clicked), False
        after clicking the "search_tomorrow" input under *current_date*.
        """
        if self._is_end_date((y, m, d), end_date):
            return True
        current_date.find_element(By.XPATH, 'input[@id="search_tomorrow"]').click()
        return False

    def fetch_table_header(self, driver):
        """Return the non-empty header cell texts of the result table."""
        header_cells = driver.find_elements(By.XPATH, '//tr[@id="search_head"]/th')
        return [cell.text for cell in header_cells if cell.text != '']

    def fetch_table_contents(self, driver, table_header):
        """Return the fare rows as dicts keyed by *table_header*.

        Returns None when the page has no fare table bodies at all.  Newlines
        inside a cell are replaced by '->'.  Cells beyond the number of headers
        are dropped instead of raising IndexError.
        """
        try:
            WebDriverWait(driver, 6).until(
                EC.visibility_of_element_located((By.XPATH, '//div[@id="fare_list_new"]')))
        except TimeoutException:
            print('empty contents')
        # NOTE(review): the original layout was lost; the pause is assumed to
        # apply whether or not the wait timed out.
        time.sleep(3)
        # See if it's empty
        bodies = driver.find_elements(By.XPATH, '//div[@id="fare_list_new"]/table/tbody')
        if not bodies:
            return None
        table_contents = []
        for body in bodies:
            # Read each cell's text once; every .text is a browser round trip.
            texts = [cell.text.replace('\n', '->')
                     for cell in body.find_elements(By.XPATH, './tr/td')]
            # NOTE(review): the original appears to keep a row only when its
            # last cell has text; rows without cells are skipped as well.
            if not texts or texts[-1] == '':
                continue
            table_contents.append(dict(zip(table_header, texts)))
        return table_contents

    def fetch_select_options(self, element):
        """Wrap *element* in a Select and map option text to option value.

        The placeholder option labelled '出発地' is left out of the mapping.
        """
        select_element = Select(element)
        select_options_dic = {
            option.text: option.get_attribute('value')
            for option in select_element.options
            if option.text != '出発地'
        }
        return select_element, select_options_dic

    def set_dep_date(self, start, dep_date, driver):
        """Write a 'Y/M/D' start date into the departure_date[y|m|d] inputs.

        *dep_date* is accepted for signature compatibility but is not used.
        """
        parts = start.split('/')
        for idx, suffix in enumerate(('y', 'm', 'd')):
            field = driver.find_element(
                By.XPATH, f'//input[@id="departure_date[{suffix}]"]')
            # Pass the value as a script argument rather than splicing it
            # into the JavaScript source.
            driver.execute_script(
                'arguments[0].setAttribute("value", arguments[1])', field, parts[idx])

    def _resolve_dates(self, row):
        """Return (start, end) from *row*, using get_date() for invalid values."""
        start = row['start']
        end = row['end']
        start_ok = collectors_utils.check_date(start)
        end_ok = collectors_utils.check_date(end)
        # NOTE(review): replace('/', '/') is a no-op as written; it may have
        # been meant to normalise a full-width slash - confirm.
        if start_ok:
            start = start.replace('/', '/')
        else:
            start = collectors_utils.get_date()[0]
        if end_ok:
            end = end.replace('/', '/')
        else:
            end = collectors_utils.get_date()[1]
        return start, end

    def _prepare_output_dir(self, dep, des):
        """Create (if needed) and return today's output directory for dep -> des."""
        dir_date = datetime.now().strftime('%Y%m%d')
        dir_name = f'片道-{dep.replace("/","")}-{des.replace("/","")}-{dir_date}'
        out_dir = f'{self.data_dir}/{dir_name}'
        os.makedirs(out_dir, exist_ok=True)
        return out_dir

    def _collect_row(self, driver, row):
        """Run one search described by *row* and write a file per result day."""
        dep = row['dep']
        des = row['des']
        # Departure locations
        dep_box = driver.find_element(By.XPATH, '//select[@name="departure_airport_id"]')
        dep_box, select_options_dic = self.fetch_select_options(dep_box)
        dep_box.select_by_value(select_options_dic[dep])
        # Destination locations
        des_box = driver.find_element(By.XPATH, '//select[@name="arrival_airport_id"]')
        des_box, select_options_dic = self.fetch_select_options(des_box)
        des_box.select_by_value(select_options_dic[des])
        dep_date = driver.find_element(By.XPATH, '//input[@id="datePicker"]')
        start, end = self._resolve_dates(row)
        self.set_dep_date(start, dep_date, driver)
        # click search button
        driver.find_element(By.XPATH, '//button[@id="js-btnSearchTicket"]').click()
        table_header = self.fetch_table_header(driver)
        out_dir = self._prepare_output_dir(dep, des)
        end_parts = end.split('/')
        while True:
            current_date = driver.find_element(By.XPATH, '//p[@class="flightselect-cont"]')
            y, m, d = current_date.text.split('/')
            file_name = f"{out_dir}/{m}-{d}_{end_parts[1]}-{end_parts[2]}"
            table_contents = self.fetch_table_contents(driver, table_header)
            # NOTE(review): as in the original, the loop stops on the end date
            # before that day's rows are written - confirm this is intended.
            if self.page_feed(driver, end, current_date, y, m, d):
                break
            if table_contents is not None:
                collectors_utils.write_file(file_name, table_header, table_contents)

    def collect_price(self, driver, current_url):
        """Process every settings row, then quit *driver*.

        The driver is quit exactly once, also when a search fails.  If the
        search form does not become visible within 30 seconds, collection
        stops instead of continuing with an unusable page.
        """
        try:
            # Load a file with specified conditions.
            rows = collectors_utils.read_file(self.data_dir)
            for row in rows:
                try:
                    WebDriverWait(driver, 30).until(
                        EC.visibility_of_element_located(
                            (By.XPATH, '//select[@name="departure_airport_id"]')))
                except TimeoutException:
                    print('element invisible')
                    return
                self._collect_row(driver, row)
                # Return to the search form for the next row.
                driver.get(current_url)
        finally:
            driver.quit()

    def main(self):
        """Open the site and collect prices for all configured routes."""
        driver = collectors_utils.set_web_driver(self.base_url)
        current_url = driver.current_url
        self.collect_price(driver, current_url)
# wonderful spam!
import csv
import os
import re
import sys
import time
from datetime import datetime

from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

sys.path.append('./scripts')
import collectors_utils

# if you want to use choromedriver_binary
# import chromedriver_binary
# chromedriver_binary.add_chromedriver_to_path()

# The currently selected cell in the per-day lowest-price strip.
ACTIVE_DAY_XPATH = '//div[@class="lowest_price_list_cell tx-c active selectable"]'


class FifthCollector():
    """Scrape one-way flight tables, one result page per day, for each settings row."""

    def __init__(self, url, data_dir):
        """Remember the site URL and the directory holding settings and output."""
        self.base_url = url
        self.data_dir = data_dir

    def close_modal(self, driver):
        """Click the modal's close button if it is there; otherwise do nothing."""
        try:
            driver.find_element(By.XPATH, '//div[@id="cv-tech-modal-close"]').click()
        except WebDriverException:
            # Best effort: a missing or unclickable modal is not an error.
            pass

    @staticmethod
    def _is_end_date(m, d, end_date):
        """Return True when month *m* and day *d* match those of 'Y/M/D' *end_date*.

        Fields are compared as integers so that '1' matches '01'; non-numeric
        text falls back to exact string equality.  The year is not compared.
        """
        end_parts = end_date.split('/')
        end_m, end_d = end_parts[1], end_parts[2]
        try:
            return int(m) == int(end_m) and int(d) == int(end_d)
        except ValueError:
            return m == end_m and d == end_d

    def page_feed(self, driver, end_date, current_date, m, d):
        """Advance to the next day unless the end date has been reached.

        Returns True when m/d is the end date (nothing is clicked), False
        after clicking the sibling that follows the active day cell.
        *current_date* is accepted for signature compatibility but not used.
        """
        if self._is_end_date(m, d, end_date):
            return True
        active_cell = driver.find_element(By.XPATH, ACTIVE_DAY_XPATH)
        active_cell.find_element(By.XPATH, 'following-sibling::div').click()
        return False

    def fetch_table_header(self, driver):
        """Return the non-empty header cell texts of the flight table."""
        header_cells = driver.find_elements(
            By.XPATH,
            '//div[@class="flight-table-header border-1 border-gray-thin box-bb"]/div')
        return [cell.text for cell in header_cells if cell.text != '']

    def fetch_table_contents(self, driver, table_header):
        """Return the flight rows as dicts keyed by *table_header*.

        Returns None when the page has no flight rows at all.  Newlines inside
        a cell are replaced by '->'.  Cells beyond the number of headers are
        dropped instead of raising IndexError.
        """
        try:
            WebDriverWait(driver, 6).until(
                EC.visibility_of_element_located((By.XPATH, '//table[@class="flight-table"]')))
        except TimeoutException:
            print('empty contents')
        # NOTE(review): the original layout was lost; the pause is assumed to
        # apply whether or not the wait timed out.
        time.sleep(3)
        # See if it's empty
        table_rows = driver.find_elements(By.XPATH, '//table[@class="flight-table"]/tbody/tr')
        if not table_rows:
            return None
        table_contents = []
        for table_row in table_rows:
            # Read each cell's text once; every .text is a browser round trip.
            texts = [cell.text.replace('\n', '->')
                     for cell in table_row.find_elements(By.XPATH, 'td')]
            # NOTE(review): the original appears to keep a row only when its
            # last cell has text; rows without cells are skipped as well.
            if not texts or texts[-1] == '':
                continue
            table_contents.append(dict(zip(table_header, texts)))
        return table_contents

    def fetch_select_options(self, element):
        """Wrap *element* in a Select and map option text to option value.

        The placeholder option labelled '出発地' is left out of the mapping.
        """
        select_element = Select(element)
        select_options_dic = {
            option.text: option.get_attribute('value')
            for option in select_element.options
            if option.text != '出発地'
        }
        return select_element, select_options_dic

    def set_dep_date(self, start, dep_date, driver):
        """Write a 'Y/M/D' start date into the departure_date[y|m|d] inputs.

        Not called by collect_price in this class.  *dep_date* is accepted
        for signature compatibility but is not used.
        """
        parts = start.split('/')
        for idx, suffix in enumerate(('y', 'm', 'd')):
            field = driver.find_element(
                By.XPATH, f'//input[@id="departure_date[{suffix}]"]')
            driver.execute_script(
                'arguments[0].setAttribute("value", arguments[1])', field, parts[idx])

    def _resolve_dates(self, row):
        """Return (start, end) from *row*, using get_date() for invalid values."""
        start = row['start']
        end = row['end']
        start_ok = collectors_utils.check_date(start)
        end_ok = collectors_utils.check_date(end)
        # NOTE(review): replace('/', '/') is a no-op as written; it may have
        # been meant to normalise a full-width slash - confirm.
        if start_ok:
            start = start.replace('/', '/')
        else:
            start = collectors_utils.get_date()[0]
        if end_ok:
            end = end.replace('/', '/')
        else:
            end = collectors_utils.get_date()[1]
        return start, end

    def _prepare_output_dir(self, dep, des):
        """Create (if needed) and return today's output directory for dep -> des."""
        dir_date = datetime.now().strftime('%Y%m%d')
        dir_name = f'片道-{dep.replace("/","")}-{des.replace("/","")}-{dir_date}'
        out_dir = f'{self.data_dir}/{dir_name}'
        os.makedirs(out_dir, exist_ok=True)
        return out_dir

    def _collect_row(self, driver, row):
        """Run one search described by *row* and write a file per result day."""
        dep = row['dep']
        des = row['des']
        # Departure locations
        dep_box = driver.find_element(By.XPATH, '//select[@class="pl10 departure"]')
        dep_box, select_options_dic = self.fetch_select_options(dep_box)
        dep_box.select_by_value(select_options_dic[dep])
        # Destination locations
        des_box = driver.find_element(By.XPATH, '//select[@class="pl10 arrive"]')
        des_box, select_options_dic = self.fetch_select_options(des_box)
        des_box.select_by_value(select_options_dic[des])
        dep_date = driver.find_element(
            By.XPATH, '//input[@aria-labelledby="aria-label-departure-date"]')
        start, end = self._resolve_dates(row)
        # Set the date input's value attribute directly; the value is passed
        # as a script argument rather than spliced into the JavaScript source.
        driver.execute_script(
            'arguments[0].setAttribute("value", arguments[1])', dep_date, start)
        # click search button
        driver.find_element(By.XPATH, '//div[@class="decide-btn btn-grad-orange"]').click()
        table_header = self.fetch_table_header(driver)
        out_dir = self._prepare_output_dir(dep, des)
        end_parts = end.split('/')
        while True:
            current_date = driver.find_element(By.XPATH, ACTIVE_DAY_XPATH + '/p[1]')
            # Turning '(' into '/' lets one split yield month, day and the
            # trailing remainder, which is discarded.
            m, d, _ = current_date.text.replace('(', '/').split('/')
            file_name = f"{out_dir}/{m}-{d}_{end_parts[1]}-{end_parts[2]}"
            table_contents = self.fetch_table_contents(driver, table_header)
            # NOTE(review): as in the original, the loop stops on the end date
            # before that day's rows are written - confirm this is intended.
            if self.page_feed(driver, end, current_date, m, d):
                break
            if table_contents is not None:
                collectors_utils.write_file(file_name, table_header, table_contents)

    def collect_price(self, driver, current_url):
        """Process every settings row, then quit *driver*.

        The driver is quit exactly once, also when a search fails.  If the
        search form does not become visible within 30 seconds, collection
        stops instead of continuing with an unusable page.
        """
        try:
            # Load a file with specified conditions.
            rows = collectors_utils.read_file(self.data_dir)
            for row in rows:
                try:
                    WebDriverWait(driver, 30).until(
                        EC.visibility_of_element_located(
                            (By.XPATH, '//select[@class="pl10 departure"]')))
                except TimeoutException:
                    print('element invisible')
                    return
                self._collect_row(driver, row)
                # Return to the search form for the next row.
                driver.get(current_url)
        finally:
            driver.quit()

    def main(self):
        """Open the site and collect prices for all configured routes."""
        driver = collectors_utils.set_web_driver(self.base_url)
        current_url = driver.current_url
        self.collect_price(driver, current_url)
# wonderful spam!
import csv
import os
import re
import sys
import time
from datetime import datetime

from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

sys.path.append('./scripts')
import collectors_utils

# if you want to use choromedriver_binary
# import chromedriver_binary
# chromedriver_binary.add_chromedriver_to_path()

# The currently selected cell in the per-day lowest-price strip.
# NOTE(review): confirm this strip exists on the sixth site's result page.
ACTIVE_DAY_XPATH = '//div[@class="lowest_price_list_cell tx-c active selectable"]'


class SixthCollector():
    """Scrape one-way flight listings, one result page per day, for each settings row."""

    def __init__(self, url, data_dir):
        """Remember the site URL and the directory holding settings and output."""
        self.base_url = url
        self.data_dir = data_dir

    def airticket(self, driver):
        """Open the flight tab, choose one-way, then follow the header flight link."""
        try:
            WebDriverWait(driver, 6).until(
                EC.visibility_of_element_located((By.XPATH, '//button[@id="tab-flight-tab-hp"]')))
        except TimeoutException:
            # Give the page a little longer before trying the click anyway.
            time.sleep(1)
        driver.find_element(By.XPATH, '//button[@id="tab-flight-tab-hp"]').click()
        time.sleep(1)
        driver.find_element(
            By.XPATH, '//label[@id="flight-type-one-way-label-hp-flight"]').click()
        time.sleep(1)
        driver.find_element(By.XPATH, '//a[@id="primary-header-flight"]').click()

    def close_modal(self, driver):
        """Click the modal's close button if it is there; otherwise do nothing."""
        try:
            driver.find_element(By.XPATH, '//div[@id="cv-tech-modal-close"]').click()
        except WebDriverException:
            # Best effort: a missing or unclickable modal is not an error.
            pass

    @staticmethod
    def _is_end_date(m, d, end_date):
        """Return True when month *m* and day *d* match those of 'Y/M/D' *end_date*.

        Fields are compared as integers so that '1' matches '01'; non-numeric
        text falls back to exact string equality.  The year is not compared.
        """
        end_parts = end_date.split('/')
        end_m, end_d = end_parts[1], end_parts[2]
        try:
            return int(m) == int(end_m) and int(d) == int(end_d)
        except ValueError:
            return m == end_m and d == end_d

    def page_feed(self, driver, end_date, current_date, m, d):
        """Advance to the next day unless the end date has been reached.

        Returns True when m/d is the end date (nothing is clicked), False
        after clicking the sibling that follows the active day cell.
        *current_date* is accepted for signature compatibility but not used.
        """
        if self._is_end_date(m, d, end_date):
            return True
        active_cell = driver.find_element(By.XPATH, ACTIVE_DAY_XPATH)
        active_cell.find_element(By.XPATH, 'following-sibling::div').click()
        return False

    def fetch_table_header(self, driver):
        """Return the non-empty texts of the flight-listing container elements.

        Not called by collect_price, which uses a fixed numeric header.
        """
        containers = driver.find_elements(By.XPATH, '//div[@id="flight-listing-container"]')
        return [box.text for box in containers if box.text != '']

    def fetch_table_contents(self, driver, table_header):
        """Return one dict per offer listing, keyed by *table_header*.

        Each listing's text is split on newlines and paired with the headers
        in order.  Lines beyond the number of headers are dropped instead of
        raising IndexError.  Returns None when the page has no listings.
        """
        try:
            WebDriverWait(driver, 6).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '//li[@data-test-id="offer-listing"]')))
        except TimeoutException:
            print('empty contents')
        # NOTE(review): the original layout was lost; the pause is assumed to
        # apply whether or not the wait timed out.
        time.sleep(3)
        # See if it's empty
        listings = driver.find_elements(By.XPATH, '//li[@data-test-id="offer-listing"]')
        if not listings:
            return None
        return [dict(zip(table_header, item.text.split('\n'))) for item in listings]

    def fetch_select_options(self, element):
        """Wrap *element* in a Select and map option text to option value.

        The placeholder option labelled '出発地' is left out of the mapping.
        Not called by collect_price in this class.
        """
        select_element = Select(element)
        select_options_dic = {
            option.text: option.get_attribute('value')
            for option in select_element.options
            if option.text != '出発地'
        }
        return select_element, select_options_dic

    def set_dep_date(self, start, dep_date, driver):
        """Write a 'Y/M/D' start date into the departure_date[y|m|d] inputs.

        Not called by collect_price in this class.  *dep_date* is accepted
        for signature compatibility but is not used.
        """
        parts = start.split('/')
        for idx, suffix in enumerate(('y', 'm', 'd')):
            field = driver.find_element(
                By.XPATH, f'//input[@id="departure_date[{suffix}]"]')
            driver.execute_script(
                'arguments[0].setAttribute("value", arguments[1])', field, parts[idx])

    def _resolve_dates(self, row):
        """Return (start, end) from *row*, using get_date() for invalid values."""
        start = row['start']
        end = row['end']
        start_ok = collectors_utils.check_date(start)
        end_ok = collectors_utils.check_date(end)
        # NOTE(review): replace('/', '/') is a no-op as written; it may have
        # been meant to normalise a full-width slash - confirm.
        if start_ok:
            start = start.replace('/', '/')
        else:
            start = collectors_utils.get_date()[0]
        if end_ok:
            end = end.replace('/', '/')
        else:
            end = collectors_utils.get_date()[1]
        return start, end

    def _prepare_output_dir(self, dep, des):
        """Create (if needed) and return today's output directory for dep -> des."""
        dir_date = datetime.now().strftime('%Y%m%d')
        dir_name = f'片道-{dep.replace("/","")}-{des.replace("/","")}-{dir_date}'
        out_dir = f'{self.data_dir}/{dir_name}'
        os.makedirs(out_dir, exist_ok=True)
        return out_dir

    def _collect_row(self, driver, row):
        """Run one search described by *row* and write a file per result day."""
        dep = row['dep']
        des = row['des']
        # Departure locations
        dep_box = driver.find_element(By.XPATH, '//*[@id="flight-origin-hp-flight"]')
        dep_box.clear()
        dep_box.send_keys(dep)
        # Destination locations
        des_box = driver.find_element(By.XPATH, '//*[@id="flight-destination-hp-flight"]')
        des_box.clear()
        des_box.send_keys(des)
        dep_date = driver.find_element(By.XPATH, '//input[@id="flight-departing-single-flp"]')
        start, end = self._resolve_dates(row)
        dep_date.clear()
        dep_date.send_keys(start)
        # click search button (through JavaScript, as in the original)
        search_btn = driver.find_element(
            By.XPATH, '//label[@class="col search-btn-col"]/button')
        driver.execute_script('arguments[0].click()', search_btn)
        # Listings are stored under fixed numeric column names 1..19.
        table_header = list(range(1, 20))
        out_dir = self._prepare_output_dir(dep, des)
        end_parts = end.split('/')
        while True:
            current_date = driver.find_element(By.XPATH, ACTIVE_DAY_XPATH + '/p[1]')
            # Turning '(' into '/' lets one split yield month, day and the
            # trailing remainder, which is discarded.
            m, d, _ = current_date.text.replace('(', '/').split('/')
            file_name = f"{out_dir}/{m}-{d}_{end_parts[1]}-{end_parts[2]}"
            table_contents = self.fetch_table_contents(driver, table_header)
            # NOTE(review): as in the original, the loop stops on the end date
            # before that day's rows are written - confirm this is intended.
            if self.page_feed(driver, end, current_date, m, d):
                break
            if table_contents is not None:
                collectors_utils.write_file(file_name, table_header, table_contents)

    def collect_price(self, driver, current_url):
        """Process every settings row, then quit *driver*.

        The driver is quit exactly once, also when a search fails.  A search
        form that is not visible within 30 seconds is only reported; the row
        is still attempted, as in the original.
        """
        try:
            # Load a file with specified conditions.
            rows = collectors_utils.read_file(self.data_dir)
            for row in rows:
                try:
                    WebDriverWait(driver, 30).until(
                        EC.visibility_of_element_located(
                            (By.XPATH, '//*[@id="flight-origin-hp-flight"]')))
                except TimeoutException:
                    print('element invisible')
                self._collect_row(driver, row)
                # Return to the search form for the next row.
                driver.get(current_url)
        finally:
            driver.quit()

    def main(self):
        """Open the site, switch to one-way flight search and collect prices."""
        driver = collectors_utils.set_web_driver(self.base_url)
        self.airticket(driver)
        current_url = driver.current_url
        self.collect_price(driver, current_url)
説明といっても殆どが既存クラスのコピーで、多少の時刻取得やデータ加工が変わっているのみです。 物凄く簡単に一気に説明しているのは、一応期限が5日と(勝手に決めていたので)あと数時間だからです。
Most of the code is a copy of the existing classes, with only minor changes to how dates are obtained and how the data is processed. I am explaining it all at once and very briefly because the deadline (which I set myself) is five days, and only a few hours remain.
後は、crawler.pyに付け足していけば完成です。
Then add it to crawler.py and you're done.
import argparse
import time
import sys

sys.path.append('.')
from collectors.firstCollector import FirstCollector
from collectors.secondCollector import SecondCollector
from collectors.thirdCollector import ThirdCollector
from collectors.fourthCollector import FourthCollector
from collectors.fifthCollector import FifthCollector
from collectors.sixthCollector import SixthCollector


def _run_collector(label, value, allowed, hint, build):
    """Validate *value*, then run and time one collector.

    Does nothing when *value* is None (the option was not given).  When
    *value* is not in *allowed*, prints *hint* and exits with status 1.
    Otherwise calls ``build(value).main()`` and prints the elapsed seconds.
    """
    if value is None:
        return
    if value not in allowed:
        print(hint)
        # Non-zero status so shells and schedulers can see the failure.
        sys.exit(1)
    start_time = time.time()
    build(value).main()
    print(f'The {label} collector completed the collection in {round(time.time() - start_time)} sec')


def collectors(num1, num2, num3, num4, num5, num6):
    """Run each collector whose argument is not None, in order first to sixth.

    num1 and num2 must be 0 or 1 and are passed on to their collector;
    num3 to num6 must be 0 and only act as switches.  Arguments are checked
    one collector at a time, so earlier collectors have already run when a
    later argument turns out to be invalid.
    """
    zero_or_one = 'Please make it 0 or 1 for now.'
    zero_only = 'Please make it 0 for now.'
    jobs = (
        ('first', num1, (0, 1), zero_or_one,
         lambda n: FirstCollector('https://woderfulspam.spam', n, 'data/first_collection')),
        ('second', num2, (0, 1), zero_or_one,
         lambda n: SecondCollector('https://spam.lovelyspam', n, 'data/second_collection')),
        ('third', num3, (0,), zero_only,
         lambda n: ThirdCollector('https://egg.spambacon.spam', 'data/third_collection')),
        ('fourth', num4, (0,), zero_only,
         lambda n: FourthCollector('https://egg.spambacon.spam', 'data/fourth_collection')),
        ('fifth', num5, (0,), zero_only,
         lambda n: FifthCollector('https://egg.spambacon.spam', 'data/fifth_collection')),
        ('sixth', num6, (0,), zero_only,
         lambda n: SixthCollector('https://egg.spambacon.spam', 'data/sixth_collection')),
    )
    for label, value, allowed, hint, build in jobs:
        _run_collector(label, value, allowed, hint, build)


def crawler_py():
    """Parse the -c1 .. -c6 options and run the requested collectors."""
    parser = argparse.ArgumentParser()
    first_collector_usage = """
    This argument crawls the site for the specified https://woderfulspam.spam.
    [-c1, --c1 Number]
    Number: Choose between (round trip -> 0) and (one way -> 1).
    """
    second_collector_usage = """
    This argument crawls the site for the specified https://spam.lovelyspam.
    [-c2, --c2 Number]
    Number: Choose between (round trip -> 1) and (one way -> 0).
    """
    third_collector_usage = """
    This argument crawls the site for the specified https://egg.spambacon.spam.
    [-c3, --c3 Number]
    Number: Specifying 0 causes C3 to scrape
    """
    fourth_collector_usage = """
    This argument crawls the site for the specified https://egg.spambacon.spam.
    [-c4, --c4 Number]
    Number: Specifying 0 causes C4 to scrape
    """
    fifth_collector_usage = """
    This argument crawls the site for the specified https://egg.spambacon.spam.
    [-c5, --c5 Number]
    Number: Specifying 0 causes C5 to scrape
    """
    sixth_collector_usage = """
    This argument crawls the site for the specified https://egg.spambacon.spam.
    [-c6, --c6 Number]
    Number: Specifying 0 causes C6 to scrape
    """
    options = (
        ('c1', first_collector_usage),
        ('c2', second_collector_usage),
        ('c3', third_collector_usage),
        ('c4', fourth_collector_usage),
        ('c5', fifth_collector_usage),
        ('c6', sixth_collector_usage),
    )
    for flag, usage in options:
        parser.add_argument(f'-{flag}', f'--{flag}', type=int, help=usage)
    args = parser.parse_args()
    collectors(args.c1, args.c2, args.c3, args.c4, args.c5, args.c6)


if __name__=='__main__':
    crawler_py()
準備ができたら、crawler.pyを動かす。
When ready, run crawler.py
python crawler.py -c6 0
sixthコレクターのみを動かす。
Activates only the sixthCollector.
settings.csvの中身。
The content of settings.csv.
dep,des,start,end
札幌,東京,,
東京,札幌,2019/11/21,2019/12/20
取れたデータの中身は以下のようになる。
Here's what the data looks like.
取り敢えず、かろうじて動かせるクローラーとデータは納品できました。 なんにせよ終わらせることが大事なんです。 後はのんびり統合作業をしていきます。
For the time being, I was able to deliver crawlers that just barely run, along with the data. In any case, the important thing is to finish. After this, I will take my time with the integration work.