With the rapid development of big data technology, enterprise demand for big data talent keeps growing. Understanding the skills that the market currently requires for big data roles helps job seekers sharpen the right skills and gives employers data to support recruiting decisions.
This article shows how to use a Python crawler to scrape big-data-related job postings from recruitment sites (such as Lagou and Zhilian Zhaopin), apply natural language processing (NLP) to extract keywords and analyze skills from the job descriptions (JDs), and finally produce a visualization report.
The toolchain used in this article:
- requests + BeautifulSoup (or Scrapy) to crawl big data job postings from recruitment sites.
- Pandas to preprocess the data and remove invalid records.
- jieba word segmentation + TF-IDF (or TextRank) to extract key skill terms.
- Matplotlib or WordCloud to generate a skill keyword cloud.

Taking Lagou as an example, the crawler below fetches big-data-related job postings (requests must mimic a browser to avoid anti-scraping measures).
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import base64
# Proxy configuration
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"

# Proxy settings: embed the credentials in the proxy URL
proxyMeta = f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Basic auth credentials must be base64-encoded
proxyAuth = base64.b64encode(f"{proxyUser}:{proxyPass}".encode()).decode()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Proxy-Authorization": f"Basic {proxyAuth}"  # some proxies may require this header
}
def fetch_jobs(keyword="大数据", page=1):
    url = f"https://www.lagou.com/jobs/list_{keyword}/p-{page}?&filterOption=3"
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()  # raise if the request failed
        soup = BeautifulSoup(response.text, 'html.parser')
        jobs = []
        # note: these obfuscated class names are specific to Lagou's page and may change over time
        for item in soup.select(".item__10RTO"):
            title = item.select_one(".p-top__1F7CL a").text.strip()
            company = item.select_one(".company-name__2-SjF").text.strip()
            salary = item.select_one(".money__3Lkgq").text.strip()
            jd = item.select_one(".job-desc__3UqDp").text.strip()
            jobs.append({
                "title": title,
                "company": company,
                "salary": salary,
                "jd": jd
            })
        return jobs
    except Exception as e:
        print(f"Request failed: {e}")
        return []
# Crawl 5 pages of listings
all_jobs = []
for page in range(1, 6):
    print(f"Crawling page {page}...")
    jobs = fetch_jobs(page=page)
    if jobs:  # only keep pages that actually returned data
        all_jobs.extend(jobs)
    time.sleep(5)  # add a delay to reduce the risk of being blocked

# Save the results to CSV
if all_jobs:  # make sure we actually collected something
    df = pd.DataFrame(all_jobs)
    df.to_csv("big_data_jobs.csv", index=False)
    print(f"Crawl finished! Collected {len(all_jobs)} records.")
else:
    print("No data was collected; check the network or proxy settings.")
Remove duplicates and missing values, then extract the key fields:
import pandas as pd

df = pd.read_csv("big_data_jobs.csv")
df.drop_duplicates(inplace=True)  # remove duplicate rows
df.dropna(inplace=True)  # drop rows with missing values
print(f"Number of valid job postings: {len(df)}")
Use jieba word segmentation + TF-IDF to extract skill terms:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the stop word list
stopwords = set()
with open("stopwords.txt", "r", encoding="utf-8") as f:
    for line in f:
        stopwords.add(line.strip())

# Tokenize a JD and filter out stop words and single characters
def cut_text(text):
    words = jieba.lcut(text)
    return [word for word in words if word not in stopwords and len(word) > 1]

# Tokenize every job description
corpus = df["jd"].tolist()
words_list = [" ".join(cut_text(jd)) for jd in corpus]

# Compute TF-IDF, keeping the 100 most frequent terms
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(words_list)
feature_names = tfidf.get_feature_names_out()

# Rank terms by their total TF-IDF weight across all postings
word_importance = {}
for i, word in enumerate(feature_names):
    word_importance[word] = tfidf_matrix[:, i].sum()

sorted_skills = sorted(word_importance.items(), key=lambda x: x[1], reverse=True)[:20]

print("Top 20 skill keywords:")
for skill, score in sorted_skills:
    print(f"{skill}: {score:.2f}")
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the word cloud from TF-IDF weights, dropping very low-weight terms
word_freq = {k: v for k, v in word_importance.items() if v > 0.1}
wc = WordCloud(font_path="simhei.ttf", background_color="white", width=800, height=600)
wc.generate_from_frequencies(word_freq)

plt.figure(figsize=(10, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("Skill keyword cloud for big data job postings", fontsize=16)
plt.show()
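Matplotlib, also listed in the toolchain, works well for a ranked view of the same results. A short sketch of a horizontal bar chart of the top 20 terms, reusing sorted_skills from above (the font and color settings are assumptions made so Chinese labels render), might be:

plt.rcParams["font.sans-serif"] = ["SimHei"]  # needed so Chinese skill names render correctly

# Horizontal bar chart of the 20 highest-weighted skill terms
skills = [s for s, _ in sorted_skills]
scores = [v for _, v in sorted_skills]

plt.figure(figsize=(10, 8))
plt.barh(skills[::-1], scores[::-1], color="steelblue")  # reverse so the top skill appears first
plt.xlabel("Total TF-IDF weight")
plt.title("Top 20 skill keywords in big data job postings", fontsize=16)
plt.tight_layout()
plt.show()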
There are several directions in which this pipeline could be extended in the future.
Original statement: this article was published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission. For infringement concerns, contact cloudcommunity@tencent.com to request removal.