使用 Beautiful Soup 提取表格的行和数据

时间:2020-04-01 17:28:24

标签: pandas beautifulsoup

我正在使用 Beautiful Soup 从内部站点提取一些数据。链接的问题(Scrape tables into dataframe with BeautifulSoup)中提供的代码适用于我数据中的 4 个 td 列,但每一行里还有一个用 th 标记的单元格。我如何才能把这个 th 和所有 td 放在同一行上?

import React from "react";
import "./styles.css";

export default class App extends React.Component {
  state = {
    currentWeek: [],
    currentMonth: [],
    monthGetter: new Date().getMonth(),
    yearGetter: new Date().getFullYear()
  };

  componentDidMount() {
    this.calendarLogicHandler();
    this.weekAgendaLogicHandler();
    console.log(
      "getting current month in componentDidMount",
      this.state.currentMonth
    );
  }

  weekAgendaLogicHandler = () => {
    let currentWeek = this.state.currentMonth
      .filter(week => week.includes(new Date().setHours(0, 0, 0, 0)))
      .flat();
    this.setState({ currentWeek }, () => console.log(this.state.currentWeek));
  };

  calendarLogicHandler = () => {
    const oneDay = 86400000;
    const lastDayOfTheMonth = new Date(
      this.state.yearGetter,
      this.state.monthGetter + 1,
      0
    ).getDate();
    let currentMonth = [
      new Date(this.state.yearGetter, this.state.monthGetter, 1).valueOf()
    ]; //starts on month day 1
    for (let i = 1; i < lastDayOfTheMonth; i++) {
      //push the entire month
      currentMonth.push(Math.max.apply(null, currentMonth) + oneDay);
    }
    //localize the first date of the month dates array and check what day of the week it is
    //spread the days of the week, which are the remaining days of prev month to fill calendar first week
    for (let i = new Date(Math.min(...currentMonth)).getDay(); i > 0; i--) {
      currentMonth.unshift(Math.min.apply(null, currentMonth) - oneDay);
    }
    //spread the days of the week, which are the remaining days of prev month to fill calendar last week
    for (let i = new Date(Math.max(...currentMonth)).getDay(); i < 6; i++) {
      currentMonth.push(Math.max.apply(null, currentMonth) + oneDay);
    }
    let monthInWeeks = [];
    for (let i = 0; i < currentMonth.length; i += 7) {
      let chunk = currentMonth.slice(i, i + 7);
      monthInWeeks.push(chunk);
    }
    currentMonth = monthInWeeks;
    this.setState({ currentMonth });
  };

  render() {
    console.log("current month in render()", this.state.currentMonth);
    return (
      <div className="App">
        <h1>Testing componentDidMount</h1>
      </div>
    );
  }
}

我希望我的 DataFrame 有 5 列(包括值为 Receive 的那一列),以便最终输出如下所示:

Manager ID   Defect Count    Transactions     DPMO
bedfli          155            2215         898
elojr           26             745          480
torse           0              16            0
jogsn           115            1767         6783

# Collect the stripped text of every non-empty <td> cell in each table row;
# rows that yield no cells are skipped.
# NOTE: only <td> elements are selected, so a <th> cell inside a row is not
# included in the collected columns.
res = []
for tr in rows:
    cells = tr.find_all('td')
    row = [cell.text.strip() for cell in cells if cell.text.strip()]
    if row:
        res.append(row)
df = pd.DataFrame(res, columns=['Manager ID', 'Defect Count', 'Transactions', 'DPMO'])
print(df)

<tr role = 'row'>
<td>bedfli</td>
<th>Receive</th>
<td>155</td>
<td>2215</td>
<td>898</td>
</tr>

1 个答案:

答案 0 :(得分:0)

使用SimplifiedDoc库的解决方案。

from simplified_scrapy import SimplifiedDoc

# Sample markup: a header row and one data row, each mixing <td> cells with a
# single <th> cell.
html = '''
<table>
<tr>
<td>Manager ID</td>
<th>Process</th>
<td>Defect Count</td>
<td>Transaction</td>
<td>DPMO</td>
</tr>
<tr role = 'row'>
<td>bedfli</td>
<th>Receive</th>
<td>155</td>
<td>2215</td>
<td>898</td>
</tr>
</table>
'''
doc = SimplifiedDoc(html)

# The answer presents three alternative ways to read the table rows.
# 1) Let the library extract the table directly.
rows_from_get_table = doc.getTable(body='table')
# 2) Select each <tr> and take the text of its children.
rows_from_children = doc.selects('table>tr').children.text
# 3) Select each <tr>, then explicitly pick both <td> and <th> cells.
rows_from_cells = doc.selects('table>tr').selects('td|th').text

# Only the result of the third approach is printed.
table = rows_from_cells
print(table)

结果:

[['Manager ID', 'Process', 'Defect Count', 'Transaction', 'DPMO'], ['bedfli', 'Receive', '155', '2215', '898']]