我正在使用 BeautifulSoup 从内部站点提取一些数据。下面链接中提供的代码可以正确提取我数据中的 4 列(td 单元格),但每一行中还有一个用 th 标记的单元格没有被提取。我如何才能把这个 th 和所有 td 放在同一行中? 参考链接:Scrape tables into dataframe with BeautifulSoup
import React from "react";
import "./styles.css";
export default class App extends React.Component {
state = {
currentWeek: [],
currentMonth: [],
monthGetter: new Date().getMonth(),
yearGetter: new Date().getFullYear()
};
componentDidMount() {
this.calendarLogicHandler();
this.weekAgendaLogicHandler();
console.log(
"getting current month in componentDidMount",
this.state.currentMonth
);
}
weekAgendaLogicHandler = () => {
let currentWeek = this.state.currentMonth
.filter(week => week.includes(new Date().setHours(0, 0, 0, 0)))
.flat();
this.setState({ currentWeek }, () => console.log(this.state.currentWeek));
};
calendarLogicHandler = () => {
const oneDay = 86400000;
const lastDayOfTheMonth = new Date(
this.state.yearGetter,
this.state.monthGetter + 1,
0
).getDate();
let currentMonth = [
new Date(this.state.yearGetter, this.state.monthGetter, 1).valueOf()
]; //starts on month day 1
for (let i = 1; i < lastDayOfTheMonth; i++) {
//push the entire month
currentMonth.push(Math.max.apply(null, currentMonth) + oneDay);
}
//localize the first date of the month dates array and check what day of the week it is
//spread the days of the week, which are the remaining days of prev month to fill calendar first week
for (let i = new Date(Math.min(...currentMonth)).getDay(); i > 0; i--) {
currentMonth.unshift(Math.min.apply(null, currentMonth) - oneDay);
}
//spread the days of the week, which are the remaining days of prev month to fill calendar last week
for (let i = new Date(Math.max(...currentMonth)).getDay(); i < 6; i++) {
currentMonth.push(Math.max.apply(null, currentMonth) + oneDay);
}
let monthInWeeks = [];
for (let i = 0; i < currentMonth.length; i += 7) {
let chunk = currentMonth.slice(i, i + 7);
monthInWeeks.push(chunk);
}
currentMonth = monthInWeeks;
this.setState({ currentMonth });
};
render() {
console.log("current month in render()", this.state.currentMonth);
return (
<div className="App">
<h1>Testing componentDidMount</h1>
</div>
);
}
}
我希望我的数据框有 5 列(包括值为 Receive 的那一列,即 th 单元格),使最终输出如下所示:
Manager ID Defect Count Transactions DPMO
bedfli 155 2215 898
elojr 26 745 480
torse 0 16 0
jogsn 115 1767 6783
# Collect the stripped text of every non-empty <td> cell, one list per table
# row, then load the rows into a four-column DataFrame and print it.
res = []
for tr in rows:
    td = tr.find_all('td')
    stripped = (cell.text.strip() for cell in td)
    row = [text for text in stripped if text]
    if not row:
        # Rows without any non-empty <td> text are skipped.
        continue
    res.append(row)
column_names = ['Manager ID', 'Defect Count', 'Transactions', 'DPMO']
df = pd.DataFrame(res, columns=column_names)
print(df)
<tr role = 'row'>
<td>bedfli</td>
<th>Receive</th>
<td>155</td>
<td>2215</td>
<td>898</td>
答案 0 :(得分:0)
使用SimplifiedDoc库的解决方案。
from simplified_scrapy import SimplifiedDoc
# Sample input: a table whose rows mix <td> cells with one <th> cell each.
html = '''
<table>
<tr>
<td>Manager ID</td>
<th>Process</th>
<td>Defect Count</td>
<td>Transaction</td>
<td>DPMO</td>
</tr>
<tr role = 'row'>
<td>bedfli</td>
<th>Receive</th>
<td>155</td>
<td>2215</td>
<td>898</td>
</tr>
</table>
'''
doc = SimplifiedDoc(html)
# Three alternative ways of reading the table are shown. Each assignment
# overwrites `table`, so only the result of the third way reaches print() below.
# First way: let the library extract the table directly.
# NOTE(review): presumably returns the same list of rows as the third way - confirm.
table = doc.getTable(body='table')
# Second way: take the text of every child element of each <tr>.
# NOTE(review): presumably equivalent to the third way for this input - confirm.
table = doc.selects('table>tr').children.text
# Third way: select the <td> and <th> cells of each <tr> explicitly and take their text.
table = doc.selects('table>tr').selects('td|th').text
print (table)
结果:
[['Manager ID', 'Process', 'Defect Count', 'Transaction', 'DPMO'], ['bedfli', 'Receive', '155', '2215', '898']]