拆分并替换 pandas 数据框中的所有字符串

时间:2020-01-28 10:15:33

标签: python pandas dataframe replace split

我有一个很大的数据框,其中每一行都包含一个字符串。我想将每个字符串拆分成几列,并替换其中的两种字符。

下面的代码可以完成任务,但是在大型数据框上速度很慢。有没有比使用 for 循环更快的方法?

import numpy
import numpy as np
from sklearn.linear_model import LogisticRegression
from tensorflow import keras, metrics
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
from webencodings import labels
from PIL import ImageFile

# Tell PIL to load image files whose data is truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Root directories of the training, validation and test image sets.
train_path = r'C:\Users\Acer\imagerec\BAYBAYIN\TRAIN'
valid_path = r'C:\Users\Acer\imagerec\BAYBAYIN\VAL'
test_path = r'C:\Users\Acer\imagerec\BAYBAYIN\TEST'
batch_size = 30

# 48 class names: five letter-named labels followed by the numbers 1-44 as
# strings. '27' is absent, exactly as in the hand-written list this replaces.
class_labels = ['A', 'BA', 'KA', 'GA', 'HA'] + [
    str(number) for number in range(1, 45) if number != 27
]

# Directory-backed batch iterators for the train / validation / test splits.
# Each split gets its own ImageDataGenerator that applies Xception's
# preprocess_input; images are requested at 299x299, in batches of 5, with
# label indices following the order of class_labels.
# Only the test iterator disables shuffling.
train_batches = ImageDataGenerator(
    preprocessing_function=keras.applications.xception.preprocess_input,
).flow_from_directory(
    train_path,
    target_size=(299, 299),
    classes=class_labels,
    batch_size=5,
)
valid_batches = ImageDataGenerator(
    preprocessing_function=keras.applications.xception.preprocess_input,
).flow_from_directory(
    valid_path,
    target_size=(299, 299),
    classes=class_labels,
    batch_size=5,
)
test_batches = ImageDataGenerator(
    preprocessing_function=keras.applications.xception.preprocess_input,
).flow_from_directory(
    test_path,
    target_size=(299, 299),
    classes=class_labels,
    batch_size=5,
    shuffle=False,
)

# Backbone: VGG19 without its fully-connected top.
# NOTE(review): the data generators in this file apply Xception's
# preprocess_input while the backbone here is VGG19 - confirm this pairing
# is intended.
base_model = keras.applications.vgg19.VGG19(include_top=False)

# Classification head: global average pooling, one 1024-unit ReLU layer and a
# 48-way softmax output.
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(1024, activation='relu')(x)
x = Dense(48, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=x)

# Mark the backbone as non-trainable before the model is compiled.
base_model.trainable = False

# Number of training epochs.
N = 1

print("HANG ON LEARNING IN PROGRESS...")

# `learning_rate` is the documented argument name of the Adam optimizer; the
# `lr` alias used previously is deprecated and rejected by newer Keras releases.
model.compile(Adam(learning_rate=.0001), loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit_generator(train_batches, steps_per_epoch=1290, validation_data=valid_batches,
                              validation_steps=90, epochs=N, verbose=1)

print("[INFO]evaluating model...")

# Ground-truth class indices of the test images, in the iterator's file order.
test_labels = test_batches.classes
# NOTE(review): steps=28 is hard-coded; with the batch size of 5 used by
# test_batches this covers 140 images - confirm it matches the test set size.
predictions = model.predict_generator(test_batches, steps=28, verbose=1)


import matplotlib.pyplot as plt
import numpy as np


# Show a 48x48 grid of random values and label both axes with the class names.
# class_labels holds the same 48 strings that were previously spelled out
# twice here.
# NOTE(review): the image is random data, not a confusion matrix - confirm
# whether this is only a placeholder.
tick_positions = np.arange(0, 48)
plt.imshow(np.random.random((48, 48)), interpolation='nearest')
plt.xticks(tick_positions, class_labels)
plt.yticks(tick_positions, class_labels)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Data generators for the second training pass: pixel values are only scaled
# by 1/255, no augmentation is applied.
train_datagen = ImageDataGenerator(rescale=1. / 255)
test_datagen = ImageDataGenerator(rescale=1. / 255)

# Fix: flow_from_directory was previously called on the `train_path` string,
# which raises AttributeError; it is a method of ImageDataGenerator.
# classes=class_labels keeps the label indices identical to the ones used by
# the generators built earlier in this file (without it Keras orders the
# sub-directories alphanumerically, giving a different index per class).
train_generator = train_datagen.flow_from_directory(
    train_path,
    classes=class_labels,
    batch_size=batch_size,
    class_mode='categorical')

# shuffle=False keeps the batches in file order, so that
# validation_generator.classes lines up with predictions made from this
# generator. Validation metrics do not depend on the order.
validation_generator = test_datagen.flow_from_directory(
    valid_path,
    classes=class_labels,
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False)

# NOTE(review): 52800 and 13200 are hard-coded sample counts - confirm they
# match the number of images found in the two directories.
model.fit_generator(
    train_generator,
    steps_per_epoch= 52800 // batch_size,
    epochs=N,
    validation_data=validation_generator,
    validation_steps= 13200 // batch_size)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns


# Number of batches needed to see every validation sample once.
# numpy.math was an undocumented alias of the stdlib math module and has been
# removed from NumPy; numpy.ceil gives the same rounded-up batch count.
test_steps_per_epoch = int(numpy.ceil(validation_generator.samples / validation_generator.batch_size))

# Rewind the iterator so prediction starts at the first file; earlier use of
# the generator for validation may have left it part-way through the data.
validation_generator.reset()
predictions = model.predict_generator(validation_generator, steps=test_steps_per_epoch)
predicted_classes = numpy.argmax(predictions, axis=1)
# NOTE: .classes is in file order, so the report below is only meaningful when
# validation_generator yields its batches unshuffled.
true_classes = validation_generator.classes
class_labels = list(validation_generator.class_indices.keys())
report = classification_report(true_classes, predicted_classes, target_names=class_labels)
print(report)

cm = confusion_matrix(true_classes, predicted_classes)
print(cm)

plt.show()
model.save("X19baybayin.h5")

2 个答案:

答案 0 :(得分:1)

您的解决方案应改为使用 Series.str.strip 和 Series.str.split:

df1 = df[0].str.strip('[]').str.split(', ', expand=True).add_prefix('col')
print(df1)
  col0 col1 col2
0  3.4  3.4  2.5
1  3.4  3.4  2.5

如果性能很重要,请使用列表推导式代替 pandas 函数:

df1 = pd.DataFrame([x.strip('[]').split(', ') for x in df[0]]).add_prefix('col')

计时

#20k rows
df = pd.concat([df] * 10000, ignore_index=True)

In [208]: %timeit df[0].str.strip('[]').str.split(', ', expand=True).add_prefix('col')
61.5 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [209]: %timeit pd.DataFrame([x.strip('[]').split(', ') for x in df[0]]).add_prefix('col')
29.8 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

答案 1 :(得分:1)

您可以执行以下操作:

import pandas as pd
df = pd.DataFrame(['[3.4, 3.4, 2.5]', '[3.4, 3.4, 2.5]'])

# Drop the surrounding "[" and "]", then split on ", " (comma plus space) so
# the second and third columns do not keep a leading blank, and expand the
# pieces into separate columns.
df_new = df[0].str[1:-1].str.split(", ", expand=True)
df_new.columns = ["col1", "col2", "col3"]

这个想法是先去掉首尾的 [ 和 ],然后按逗号拆分并展开为数据框的多列。最后一步是重命名列。