本文共 4776 字,大约阅读时间需要 15 分钟。
???????????????????????????Python????????????????????????????????????????????????????????Python?????????Python????????????????Excel????
????????????????????????????????????????????????????????????????????Python??????????????????????Python???????????Excel????????????????
??????????????requests?????HTTP?????????????????????????????????
user-agent??????????json()???????????????????????Excel????????xlsxwriter?????????????????????xlsxwriter??????
xlsxwriter???pip install xlsxwriter
???????????xlsxwriter???
??Excel?????Workbook???Excel???????????Worksheet?
???????write()???????Excel???????????Excel???????0???
??????????????????
??????????????????????????????????????????????????????????????????????
????????Excel???????????????MySQL????????????????
??????????????????requests?pymysql???????????Python???????????MySQL?????
"""Fetch "python" course listings from study.163.com and store them in MySQL.

Result pages are downloaded in parallel by a process pool; every database
statement runs in the parent process on the single connection opened in the
``__main__`` section (module globals ``conn`` / ``cs1``).
"""
import logging
import time
from multiprocessing import Pool

import requests
from pymysql import connect

logger = logging.getLogger(__name__)

# Seconds to wait for the search endpoint before giving up on a page.
REQUEST_TIMEOUT = 10

# Keys copied from each course item, in the column order used by the
# 16-placeholder insert statement in save_to_course().
COURSE_FIELDS = (
    'courseId',
    'productId',
    'productType',
    'productName',
    'provider',
    'score',
    'scoreLevel',
    'learnerCount',
    'lessonCount',
    'lectorName',
    'originalPrice',
    'discountPrice',
    'discountRate',
    'imgUrl',
    'bigImgUrl',
    'description',
)


def get_json(index):
    """Request one page of search results and return the decoded JSON dict.

    Args:
        index: value sent as ``pageIndex`` in the request payload.

    Returns:
        The decoded response when the HTTP status is 200 and its
        ``"message"`` field equals ``"ok"``; otherwise ``None`` (including
        on network errors, timeouts and non-JSON bodies).
    """
    url = "https://study.163.com/p/search/studycourse.json"
    payload = {
        "pageSize": 50,
        "pageIndex": index,
        "relativeOffset": 0,
        "searchTimeType": -1,
        "orderType": 5,
        "priceType": -1,
        "activityId": 0,
        "qualityType": 0,
        "keyword": "python"
    }
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36",
        "accept": "application/json",
        "content-type": "application/json",
        "origin": "https://study.163.com"
    }
    try:
        # A timeout keeps one stalled request from hanging a pool worker.
        response = requests.post(
            url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        logger.warning("request for page %s failed: %s", index, exc)
        return None
    if response.status_code != 200:
        return None
    try:
        content_json = response.json()
    except ValueError:
        logger.warning("page %s did not return valid JSON", index)
        return None
    if isinstance(content_json, dict) and content_json.get("message") == "ok":
        return content_json
    return None


def get_content(content_json):
    """Return the course list stored under ``result.list``.

    Args:
        content_json: a response dict from get_json(), or ``None``.

    Returns:
        The list of course items, or ``[]`` when the response is missing
        or has no ``result`` / ``list`` entry.
    """
    if not content_json:
        # get_json() returns None for a failed page; treat it as empty.
        return []
    result = content_json.get("result") or {}
    return result.get("list") or []


def check_course_exit(course_id):
    """Return True when a row with this ``course_id`` is already in ``course``.

    Uses the module-level cursor ``cs1``.
    """
    # Bound parameter instead of string interpolation: the id comes from a
    # remote response and must not be spliced into the SQL text.
    sql = "select course_id from course where course_id = %s"
    cs1.execute(sql, (course_id,))
    return bool(cs1.fetchone())


def save_to_course(course_data):
    """Insert the given row tuples into the ``course`` table.

    Args:
        course_data: list of 16-value tuples, one per course.
    """
    if not course_data:
        return
    sql_course = """insert into course values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) """
    cs1.executemany(sql_course, course_data)


def save_mysql(content):
    """Insert every course from ``content`` that is not stored yet.

    Args:
        content: list of course item dicts as returned by get_content().
    """
    course_data = []
    queued_ids = set()
    for item in content:
        course_id = item['courseId']
        # The table check cannot see rows still waiting in course_data, so
        # duplicates inside one batch are filtered separately.
        if course_id in queued_ids or check_course_exit(course_id):
            continue
        queued_ids.add(course_id)
        course_data.append(tuple(item[field] for field in COURSE_FIELDS))
    save_to_course(course_data)


def _fetch_page(index):
    """Download one result page and return its course list (no DB access)."""
    return get_content(get_json(index))


def main(index):
    """Fetch result page ``index`` and store its courses.

    Requires the module-level cursor ``cs1`` to exist in the calling process.
    """
    save_mysql(_fetch_page(index))


if __name__ == '__main__':
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the environment or a config file.
    conn = connect(host="localhost", port=3306, database="wyy_spider", user="root",
                   password="mysql", charset="utf8")
    cs1 = conn.cursor()
    try:
        print("*******************????*******************")
        start = time.time()
        first_page = get_json(1)
        if first_page is None:
            raise SystemExit("could not fetch the first result page; nothing saved")
        total_page_count = first_page['result']["query"]["totlePageCount"]
        # Only the downloads are parallelised. A MySQL connection must not be
        # shared between processes, and with the "spawn" start method the
        # workers would not have ``cs1`` at all, so rows are written here in
        # the parent.
        # NOTE(review): pageIndex looks 1-based (the probe above uses 1), so
        # pages 1..total are requested rather than 0..total-1 - confirm.
        with Pool() as pool:
            pages = pool.map(_fetch_page, range(1, total_page_count + 1))
        for content in pages:
            save_mysql(content)
        conn.commit()
    finally:
        # Release the cursor and connection even when a page or insert fails.
        cs1.close()
        conn.close()
    print("????")
    end = time.time()
    print(f"???????{end - start}?")
    print("*******************????*******************")

# Article text that followed the listing on the same line; its non-ASCII
# characters were already lost in the source, so it is kept verbatim:
# save_to_course????????????????????????????MySQL????????????????course????????????
???????????????????????????????Python????????Excel?MySQL???????????????????????????????????????????????????????????????????
转载地址:http://blvk.baihongyu.com/