python数据分析入门,作为入门文章系列主要包含以下几个内容:

1.数据的来源(本案例采用的数据来自于上一篇文章中爬取的智联招聘信息):读取数据库数据、数据写入csv文件、读取csv文件等

2.数据的处理:对载入到内存的数据进行一系列的操作(处理总共包含清洗、过滤、分组、统计、排序、计算平均数等一系列操作,本文只简单涉及了其中某几个)

3.对处理后的数据进行可视化分析等

# !/usr/bin/env python
# -*-coding:utf-8-*-
\"\"\"
@Author  : xiaofeng
@Time    : 2018/12/19 15:23
@Desc : Less interests,More interest.(数据分析入门)
@Project : python_appliction
@FileName: analysis1.py
@Software: PyCharm
@Blog    :https://blog.csdn.net/zwx19921215
\"\"\"
import pymysql as db
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# mysql config
mysql_config = {
    \'host\': \'101.0.2.110\',
    \'user\': \'test\',
    \'password\': \'test\',
    \'data \': \'xiaofeng\',
    \'charset\': \'utf8\'
}

\"\"\"
read data from mysql and write to local file
(从数据库中读取数据写入本地文件)
@:param page_no
@:param page_size
@:param path 文件写入路径
\"\"\"


def read_to_csv(page_no, page_size, path):
    # read from data 
    data  = db.connect(**mysql_config)
    if page_no > 1:
        page_no = (page_no - 1) * page_size
    else:
        page_no = 0
    sql = \'select * from zhilian limit \' + str(page_no) + \',\' + str(page_size) + \'\'
    df = pd.read_sql(sql, data )
    print(df)
    data .close()
    # write to csv
    local = path
    df.to_csv(local, encoding=\'gbk\')


\"\"\"
read data from remote address or local address
(读取文件从远程地址或者本地)
\"\"\"


def read_from_csv(path):
    # remote address
    # data_url = \'https://xxxx/zhilian.csv\'
    # local address
    # data_url = \'G:/zhilian.csv\'
    df = pd.read_csv(path, encoding=\'gbk\')
    return df


\"\"\"
数据集的简单(过滤、分组、排序)处理
\"\"\"


def simple_op(path):
    df = read_from_csv(path)
    # top 10 :获取前10行
    print(\'--------------top 10-----------\')
    top = df.head(10)
    print(top)
    # tail 10:获取后10行
    print(\'------------tail 10------------\')
    tail = df.tail(10)
    print(tail)
    # filter:根据指定条件过滤
    print(\'-------------filter----------\')
    special_jobid = df[df.JobID == 595950]
    print(special_jobid)
    special = df[(df.id >= 110) & (df.PublishDate == \'2018-12-18\')]
    print(special)
    # check :检查各列缺失情况
    print(\'-------------check----------------\')
    check = df.info()
    print(check)
    print(\'--------------data describe-----------\')
    # count,平均数,标准差,中位数,最小值,最大值,25 % 分位数,75 % 分位数
    describe = df.describe()
    print(describe)
    # 添加新列
    df2 = df.copy()
    df2[\'AnnualSalaryAvg\'] = (df[\'AnnualSalaryMax\'] + df[\'AnnualSalaryMin\']) / 2
    # 重新排列指定列
    columns = [\'JobID\', \'Job \', \'CompanyName\', \'AnnualSalaryMin\', \'AnnualSalaryMax\', \'AnnualSalaryAvg\']
    df = pd.Data (df2, columns=columns)
    print(df)


\"\"\"
可视化分析
\"\"\"


def visualized_analysis():
    path = \'G:/zhilian.csv\'
    df = read_from_csv(path)
    df_post_count = df.groupby(\'JobLactionStr\')[\'AnnualSalaryMax\'].count().to_ ().reset_index()
    # subplots(1, 1) 表示1x1个子图,figsize=(25, 8) 子图的宽度和高度
    # f, [ax1,ax2] = plt.subplots(1, 2, figsize=(25, 8)) 表示1x2个子图
    f, ax2 = plt.subplots(1, 1, figsize=(25, 8))

    sns.barplot(x=\'JobLactionStr\', y=\'AnnualSalaryMax\', palette=\'Greens_d\', data=df_post_count, ax=ax2)
    ax2.set_ (\'各城市职位数量对比\', fontsize=15)
    ax2.set_xlabel(\'城市\')
    ax2.set_ylabel(\'数量\')
    # 用来正常显示中文标签
    plt.rcParams[\'font.sans-serif\'] = [\'SimHei\']
    # 用来正常显示负号
    plt.rcParams[\'axes.unicode_minus\'] = False
    plt.show()


if __name__ == \'__main__\':
    path = \'G:/zhilian.csv\'
    read_to_csv(0, 100, path)
    df = read_from_csv(path)
    simple_op(path)
    visualized_analysis()

 

\"\"

 

\"\"

\"\"

 

注:由于是入门文章第一篇,所以并没有对数据分析做过深的探索,更深层次的研究将会在后续系列中呈现!

收藏 打印