小言_互联网的博客

python_量化交易_舆情分析_百度情绪分析_分析股票利好利空比例

323人阅读  评论(0)

1、摘要

本文主要内容:使用百度情绪分析接口评估股票近半年的新闻,评估新闻属于利好还是利空,最终统计利好和利空的比例,供选股做参考
本文福利:赠送百度AppID:应用的唯一标识AppKey:公匙(相当于账号)AppSecret:私匙(相当于密码)

2、主要思路

  1. 选择自己要评估的股票代码数组
  2. 金融界行情中心获取股票新闻信息
  3. 得到页面的内容并保存
  4. 调用百度云自然语言处理接口,进行情感倾向分析
  5. 统计利好和利空的比例

3、代码

import os
import re

import lxml  # 一个Python库,使用它可以轻松处理XML和HTML文件,还可以用于web爬取
import requests
from aip import AipNlp
from lxml import etree
from matplotlib import pyplot

from utils.read_write import readTxt, writeOneCsv, readToStr

os.chdir(r'D:\项目\量化交易')
pyplot.rcParams['font.sans-serif'] = ['SimHei']  # 图形参数设置,用来正常显示中文标签,国标黑体
pyplot.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号


# 得到最大页码的数值
def getallpage(url):
    pagedata = requests.get(url).content.decode("gbk")
    # print(pagedata)
    # lxml教程:https://www.cnblogs.com/zhangxinqi/p/9210211.html#_label2
    mytree = lxml.etree.HTML(pagedata)
    # 取所有的页码数
    if pagedata.find("page_newslib"):
        data = mytree.xpath("//*[@class=\"page_newslib\"]//a[last()-1]/text()")
        return data
    else:
        return ['1']


# 得到页面的内容并保存
def everypagecontent(url, number):
    # 解决服务器延时的问题try
    try:
        pagedata = requests.get(url).content.decode("gbk")
        mytree = lxml.etree.HTML(pagedata)
        # 取所有的内容
        datas = mytree.xpath("//*[@class = \"newlist\"]//li/span/a/text()")
        for data in datas:
            data = data + "\r\n"
            with open(number + ".txt", "a") as file:
                file.write(data)
                file.flush()
        return datas
    except:
        print("服务器超时")


# 调用百度云自然语言处理接口,进行情感倾向分析
def analyze(number):
    """ 你的 APPID AK SK """
    # AppID:应用的唯一标识AppKey:公匙(相当于账号)AppSecret:私匙(相当于密码)
    APP_ID = '22793513'
    API_KEY = '0veui3PVDnoXSc3ZmGNI3Et5'
    SECRET_KEY = 'BGvXXlPQKigySi0MqG26i17bfnRPl8wy'
    pos = 0
    nav = 0
    avgpos = 0
    navavg = 0
    i = 0
    aipNlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)
    lines = readTxt(number + ".txt")
    str = readToStr(number + ".txt")
    companys = re.findall(r"\[(.+?)\]", str)
    company = max(companys, key=companys.count)
    for line in lines:
        aline = line.replace("\r\n", "").strip()
        if len(aline) != 0:
            try:
                result = aipNlp.sentimentClassify(aline)  # 调用百度接口
                positive = result['items'][0]['positive_prob']
                nagative = result['items'][0]['negative_prob']
                i += 1
                if positive >= nagative:
                    pos += 1
                else:
                    nav += 1
                avgpos = pos / i
                navavg = nav / i
            except Exception as e:
                pass
    print(company)
    print(avgpos)
    print(navavg)
    writeOneCsv([number, company, avgpos, navavg], '各种股票的情绪.csv')


def getpageurl(url):
    pagenumber = getallpage(url)[0]
    for i in range(1, int(pagenumber) + 1):
        try:
            if i == 1:
                url = "http://stock.jrj.com.cn/share," + number + ",ggxw.shtml"
            else:
                url = "http://stock.jrj.com.cn/share," + number + ",ggxw_" + str(i) + ".shtml"
        except:
            pass
        everypagecontent(url, number)


numbers = ['600519', '000661', '002241', '300677', '000860', '300750', '600436', '603939', '300347', '603429']

# 得到股票名称,获取新闻信息生成文件
for number in numbers:
    print(number)
    url = "http://stock.jrj.com.cn/share," + number + ",ggxw.shtml"
    getpageurl(url)

# 读取文件
for number in numbers:
    print(number)
    analyze(number)


转载:https://blog.csdn.net/qq_30803353/article/details/114791758
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场