Requirements:
Firefox
Mozilla geckodriver (https://github.com/mozilla/geckodriver/releases)
Python 3.x
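The Python dependencies can be installed with pip; the package list below is inferred from the script's imports (openpyxl is an assumption, since pandas needs an Excel engine to write the .xlsx output):

pip install pandas beautifulsoup4 lxml selenium requests tqdm openpyxl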
Usage:
Line 13: amazonAccount = "account"
Line 14: amazonPassword = "password"
Replace these with your own Amazon account and password.
Line 118: years = range(2009, 2020) ← set this to the years you want to extract.
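For reference, the edited lines would look something like this (the account and password values are placeholders, not taken from the script; note that range() excludes its end value):

amazonAccount = "your-account@example.com"   # placeholder: your Amazon login e-mail
amazonPassword = "your-password"             # placeholder: your Amazon password
years = range(2009, 2020)                    # covers 2009 through 2019; adjust both ends as needed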
Output Excel (output.xlsx):
Column A: order number
Column C: the amount actually paid at the time of purchase (shipping, etc. included)
Column E (checkPrice, shipping not included) = the sum, over the items in the column D details, of unit price * quantity
If column E does not match column C, the likely causes are (see the sketch after this list):
・shipping was included in C or charged separately
・the item's unit price has changed since purchase
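A minimal sketch of that comparison, assuming the output.xlsx written by the script below (the column names 注文日, 合計, and checkPrice are the dictionary keys used in the script):

import pandas as pd

# Load the sheet written by the script; column A (the order number) becomes the index.
df = pd.read_excel("output.xlsx", sheet_name="orders", index_col=0)

# Orders where the amount actually paid (合計, column C) differs from the
# recomputed item total (checkPrice, column E).
mismatch = df[df["合計"] != df["checkPrice"]]
print(mismatch[["注文日", "合計", "checkPrice"]])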
Screenshot:
Script:

import os
import re
import math
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
import requests
from tqdm import tqdm
import time

amazonAccount = "account"
amazonPassword = "password"

thisPath = os.path.dirname(os.path.abspath(__file__))
orders = {}  # order details, keyed by order number
invisibleList = []  # orders with 4 or more items: store their detail-page URL for later

options = Options()
options.add_argument('-headless')
browser = webdriver.Firefox(executable_path=os.path.join(
    thisPath, "geckodriver"), firefox_options=options)

# login
browser.get('https://www.amazon.co.jp/ap/signin?ie=UTF8&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.co.jp%2F%3Fref_%3Dnav_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=jpflex&_encoding=UTF8&openid.mode=checkid_setup&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&ignoreAuthState=1&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&ie=UTF8&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&fromAuthPrompt=1')
browser.find_element_by_name("email").send_keys(amazonAccount)
browser.find_element_by_name("password").send_keys(amazonPassword)
browser.find_element_by_name("rememberMe").click()
WebDriverWait(browser, 10).until(lambda x: x.find_element_by_id("signInSubmit")).click()

# order history page
browser.get('https://www.amazon.co.jp/gp/your-account/order-history')
print(browser.title)

# reuse the logged-in Selenium cookies in a plain requests session
s = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
for cookie in browser.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'])
s.headers.clear()


def getProductionNameAndNum(string):
    # split "商品名、数量:N" into [name, quantity]; quantity defaults to 1
    tmp = string.split("、数量:")
    if len(tmp) == 1:
        return [tmp[0], 1]
    else:
        return [tmp[0], int(tmp[1])]


def getSummary(arr):
    a = list(map(lambda x: re.sub(r'[\n\s]', '', x.text), arr))
    a[1] = int(re.sub(r'[,\s¥]', '', a[1]))
    return a  # [orderDate, price, orderNumber]


def getDetails(elements):
    results = []
    for e in elements:
        appFlag = "shipment" not in e.attrs['class']
        for d in e.select('.a-fixed-left-grid-col.a-col-right'):
            if appFlag:
                results.append({
                    '商品名': re.sub(r'^[\s\n]+|[\s\n]+$', '', d.select(".a-row")[0].text),
                    '数量': 1,
                    '単価': 0})
                break
            tmp = getProductionNameAndNum(d.find(class_="a-link-normal").text)
            results.append({
                '商品名': re.sub(r'^[\s\n]+|[\s\n]+$', '', tmp[0]),
                '数量': tmp[1],
                '単価': int(re.sub(r'[,\s\n¥]', '', d.select(".a-size-small.a-color-price")[0].text))
            })
    checkPrice = sum(list(map(lambda x: x['単価'] * x['数量'], results)))
    return [results, checkPrice]


def parseHtml(html):
    orderHtmls = html.select(".a-box-group.a-spacing-base.order")  # all orders on the page (max 10)
    for o in orderHtmls:
        # header
        headerHtml = o.select('.a-box.a-color-offset-background.order-info')[0]
        header = getSummary(headerHtml.select('.a-color-secondary.value'))
        orderNumber = header[2]
        orders[orderNumber] = {'注文日': header[0], '合計': header[1]}
        # check for a "**個すべての商品を表示" (show all items) link;
        # if present, the item list lives on its own page
        invisible = o.select('.a-size-medium.a-link-emphasis')
        if len(invisible):
            invisibleList.append({"orderNumber": orderNumber,
                                  "url": invisible[0].get("href")})
            continue
        # details
        if o.select('.a-box.shipment'):
            # shipped products
            detailBox = o.select('.a-box.shipment')
        else:
            # Android app orders: match tags whose class is exactly "a-box"
            detailBox = o.find_all(lambda tag: tag.name == 'div' and tag.get('class') == ['a-box'])
        d = getDetails(detailBox)
        orders[orderNumber]['明細'] = d[0]
        orders[orderNumber]['checkPrice'] = d[1]


def makeOrderUrls(orderNum, year):
    year = str(year)
    urls = []
    pages = math.ceil(orderNum / 10)
    baseUrl = "https://www.amazon.co.jp/gp/your-account/order-history/ref=ppx_yo_dt_b_pagination_1_{}?ie=UTF8&orderFilter=year-{}&search=&startIndex={}"
    for p in range(0, pages):
        urls.append(baseUrl.format(p + 1, year, p * 10))
    return urls


years = range(2009, 2020)  # order period (years to extract)
for y in tqdm(years):
    first_page_url = "https://www.amazon.co.jp/gp/your-account/order-history/ref=ppx_yo_dt_b_pagination_1_1?ie=UTF8&orderFilter=year-{}&search=&startIndex=0".format(str(y))
    html = BeautifulSoup(s.get(first_page_url, headers=headers).text, "lxml")
    num_orders = int(html.find(class_='num-orders').text.replace("件", ""))
    # print("year:{}, order numbers: {}".format(y, num_orders))
    if num_orders == 0:
        continue
    page_urls = makeOrderUrls(num_orders, y)
    parseHtml(html)  # first page
    time.sleep(1)
    for i in tqdm(range(1, len(page_urls))):  # second page onward
        # print(page_urls[i])
        html = BeautifulSoup(s.get(page_urls[i], headers=headers).text, "lxml")
        parseHtml(html)
        time.sleep(1)

# orders with 4 or more items: fetch the details from their own page
for x in invisibleList:
    r = s.get("https://www.amazon.co.jp" + x['url'], headers=headers)
    html = BeautifulSoup(r.text, "lxml")
    d = getDetails(html.select('.a-box.shipment'))
    orders[x['orderNumber']]['明細'] = d[0]
    orders[x['orderNumber']]['checkPrice'] = d[1]
    time.sleep(1)

browser.quit()

df = pd.DataFrame.from_dict(orders, orient="index")
writer = pd.ExcelWriter("output.xlsx")
df.to_excel(writer, index=True, header=True, sheet_name="orders")
writer.save()