From ef7d7c19441aecfcb9228530c093d6e02187068a Mon Sep 17 00:00:00 2001
From: tengge1 <930372551@qq.com>
Date: Fri, 30 Aug 2019 20:44:27 +0800
Subject: [PATCH] feature_columns.py

---
 ShadowEditor.AI/README.md                     |   1 +
 ShadowEditor.AI/tensorflow/feature_columns.py | 134 ++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 ShadowEditor.AI/tensorflow/feature_columns.py

diff --git a/ShadowEditor.AI/README.md b/ShadowEditor.AI/README.md
index fbc6fe8a..51569cb7 100644
--- a/ShadowEditor.AI/README.md
+++ b/ShadowEditor.AI/README.md
@@ -33,6 +33,7 @@ pip install tensorflow-gpu==2.0.0-rc0
 4. tensorflow/mnist_expert.py: expert-level handwritten digit recognition, accuracy: 98.112%
 5. tensorflow/image_classification.py: clothing image classification, accuracy: 87.81%
 6. tensorflow/basic_text_classification.py: review text classification, accuracy: 86.2%
+7. tensorflow/feature_columns.py: classifying structured data, accuracy: 72.54%
 
 ## Related Links
 
diff --git a/ShadowEditor.AI/tensorflow/feature_columns.py b/ShadowEditor.AI/tensorflow/feature_columns.py
new file mode 100644
index 00000000..f7d02e52
--- /dev/null
+++ b/ShadowEditor.AI/tensorflow/feature_columns.py
@@ -0,0 +1,134 @@
+# https://tensorflow.google.cn/beta/tutorials/keras/feature_columns
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+from tensorflow import feature_column
+from tensorflow.keras import layers
+from sklearn.model_selection import train_test_split
+
+URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
+dataframe = pd.read_csv(URL)
+print(dataframe.head())  # preview the first few rows (a bare .head() prints nothing in a script)
+
+train, test = train_test_split(dataframe, test_size=0.2)
+train, val = train_test_split(train, test_size=0.2)
+print(len(train), 'train examples')
+print(len(val), 'validation examples')
+print(len(test), 'test examples')
+
+# A utility method to create a tf.data dataset from a pandas DataFrame
+
+
+def df_to_dataset(dataframe, shuffle=True, batch_size=32):
+    dataframe = dataframe.copy()
+    labels = dataframe.pop('target')
+    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
+    if shuffle:
+        ds = ds.shuffle(buffer_size=len(dataframe))
+    ds = ds.batch(batch_size)
+    return ds
+
+
+batch_size = 5  # a small batch size is used for demonstration
+train_ds = df_to_dataset(train, batch_size=batch_size)
+val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
+test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
+
+for feature_batch, label_batch in train_ds.take(1):
+    print('Every feature:', list(feature_batch.keys()))
+    print('A batch of ages:', feature_batch['age'])
+    print('A batch of targets:', label_batch)
+
+# We will use this batch to demonstrate several types of feature columns
+example_batch = next(iter(train_ds))[0]
+
+# A utility method to create a feature column
+# and transform a batch of data
+
+
+def demo(feature_column):
+    feature_layer = layers.DenseFeatures(feature_column)
+    print(feature_layer(example_batch).numpy())
+
+
+age = feature_column.numeric_column("age")
+demo(age)
+
+age_buckets = feature_column.bucketized_column(
+    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+demo(age_buckets)
+
+thal = feature_column.categorical_column_with_vocabulary_list(
+    'thal', ['fixed', 'normal', 'reversible'])
+
+thal_one_hot = feature_column.indicator_column(thal)
+demo(thal_one_hot)
+
+# Note that the input to the embedding column is the categorical column we created earlier
+thal_embedding = feature_column.embedding_column(thal, dimension=8)
+demo(thal_embedding)
+
+thal_hashed = feature_column.categorical_column_with_hash_bucket(
+    'thal', hash_bucket_size=1000)
+demo(feature_column.indicator_column(thal_hashed))
+
+crossed_feature = feature_column.crossed_column(
+    [age_buckets, thal], hash_bucket_size=1000)
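+# Note: crossed_column does not materialize the full table of all
+# (age bucket, thal) combinations; it hashes each combination into one of
+# hash_bucket_size buckets, so distinct crosses may collide.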
+demo(feature_column.indicator_column(crossed_feature))
+
+feature_columns = []
+
+# Numeric columns
+for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
+    feature_columns.append(feature_column.numeric_column(header))
+
+# Bucketized column
+age_buckets = feature_column.bucketized_column(
+    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+feature_columns.append(age_buckets)
+
+# Categorical (one-hot) column
+thal = feature_column.categorical_column_with_vocabulary_list(
+    'thal', ['fixed', 'normal', 'reversible'])
+thal_one_hot = feature_column.indicator_column(thal)
+feature_columns.append(thal_one_hot)
+
+# Embedding column
+thal_embedding = feature_column.embedding_column(thal, dimension=8)
+feature_columns.append(thal_embedding)
+
+# Crossed column
+crossed_feature = feature_column.crossed_column(
+    [age_buckets, thal], hash_bucket_size=1000)
+crossed_feature = feature_column.indicator_column(crossed_feature)
+feature_columns.append(crossed_feature)
+
+feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
+
+batch_size = 32
+train_ds = df_to_dataset(train, batch_size=batch_size)
+val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
+test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
+
+model = tf.keras.Sequential([
+    feature_layer,
+    layers.Dense(128, activation='relu'),
+    layers.Dense(128, activation='relu'),
+    layers.Dense(1, activation='sigmoid')
+])
+
+model.compile(optimizer='adam',
+              loss='binary_crossentropy',
+              metrics=['accuracy'],
+              run_eagerly=True)  # eager mode eases debugging at some speed cost
+
+model.fit(train_ds,
+          validation_data=val_ds,
+          epochs=5)
+
+loss, accuracy = model.evaluate(test_ds)
+print("Accuracy:", accuracy)
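+
+# A minimal inference sketch (an addition to the original tutorial script):
+# inspect the trained model's sigmoid outputs for one test batch. Values
+# close to 1 indicate a predicted presence of heart disease ('target' == 1).
+for features, labels in test_ds.take(1):
+    probabilities = model.predict(features)
+    print("Predicted probabilities:", probabilities.flatten()[:5])
+    print("Actual labels:", labels.numpy()[:5])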