26 April 2015

code

        from pyspark.sql import SQLContext
        from pyspark.sql.types import StructField, IntegerType, StringType, StructType
        %matplotlib inline
        import matplotlib.pyplot as plt
        import matplotlib
        # Notebook walkthrough (Python 2 / PySpark): load a pipe-delimited text
        # file from HDFS, turn it into a Spark DataFrame with columns
        # (code, name), pull it into pandas and draw a bar chart.
        # NOTE(review): `sc` is never defined here; presumably the SparkContext
        # provided by the pyspark shell/notebook -- confirm.

        # Show which matplotlibrc file is in effect before overriding settings.
        print matplotlib.matplotlib_fname()
        # Use the "Microsoft YaHei" font for sans-serif text; presumably needed
        # so that non-ASCII (CJK) names render in the plot -- confirm the font
        # is installed on the machine running the notebook.
        matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']
        # Draw minus signs as an ASCII hyphen rather than the Unicode minus.
        matplotlib.rcParams['axes.unicode_minus'] = False

        # Read the file as an RDD of lines. use_unicode=False keeps each line as
        # a byte string; the name field is decoded explicitly further down.
        csv_dt = sc.textFile("hdfs://localhost:9000/python/temp.csv", use_unicode=False)
        # Fetch the first two lines to the driver.
        # Assumes the file starts with exactly two header lines -- TODO confirm.
        header = csv_dt.take(2)
        for h in header: print h

        # Keep every line that is not equal to one of the two lines taken above.
        # Despite the name, `footer` holds the remaining (data) lines. Note that
        # a later line identical to one of the header lines is dropped as well.
        footer = csv_dt.filter(lambda x: x not in header)
        for f in footer.collect(): print f

        # Split each remaining line into fields on the "|" delimiter.
        parts = footer.map(lambda x: x.split("|"))
        for p in parts.collect(): print p

        # Build (code, name) tuples: field 0 parsed as int, field 1 decoded from
        # UTF-8 bytes to unicode. Fields beyond the first two are ignored.
        table = parts.map(lambda x: (int(x[0]), x[1].decode('utf-8')))
        # Bare expression: presumably the last line of a notebook cell, shown
        # as the cell output.
        table

        for t in table.collect():
            c, n = t
            print c, n

        # Schema for the DataFrame: nullable integer "code", nullable string "name".
        f_code = StructField("code", IntegerType(), True)
        f_name = StructField("name", StringType(), True)
        sqlCtx = SQLContext(sc)
        schema = StructType([f_code, f_name])
        df = sqlCtx.createDataFrame(table, schema)
        # Register the DataFrame under the name "temp_table"; nothing in the
        # visible code queries it by that name.
        df.registerTempTable("temp_table")

        # Select both columns ordered by code, then collect the result to the
        # driver as a pandas DataFrame.
        results = df.select(df.code, df.name).orderBy(df.code)
        resultsPandas = results.toPandas()
        # Bare expression: presumably displayed as notebook cell output.
        resultsPandas

        # Disabled: collecting the names separately to use as tick labels.
        #xlabels = df.select(df.name).map(lambda x: x.name).collect()
        #for l in xlabels: print l

        # Bar chart with the "name" column on the x axis.
        # NOTE(review): pandas' plot() returns a matplotlib Axes, not a Figure,
        # despite the variable name -- confirm before relying on `fig`.
        fig = resultsPandas.plot(kind="bar", x="name")
        # Disabled: overriding the tick labels with the separately collected names.
        #fig.set_xticklabels(xlabels, rotation=45)
        #fig

        # Clear the current matplotlib figure.
        plt.clf()


blog comments powered by Disqus