Pandas Dataframe Groupby,其中列A的值==列B的值

时间:2019-11-28 13:03:17

标签: python pandas dataframe

当前正在尝试为看起来像这样的数据集计算比率:

using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using Polly;
using RestSharp;
using System;
using System.Collections.Generic;
using System.Net;

namespace FundsAFE.Graphite
{
    /// <summary>
    /// Executes one REST request through a Polly wait-and-retry policy. A response is
    /// retried when its status code is in <c>invalidStatusCodes</c> or when its body
    /// does not parse as a JSON object.
    /// </summary>
    public class RequestExecutor
    {
        private static readonly NLog.Logger logger = NLog.LogManager.GetCurrentClassLogger();

        private readonly IRestClient client;
        private readonly IRestRequest request;
        private Policy<IRestResponse> retryPolicy;

        /// <summary>
        /// The most recent response that the retry policy classified as a failure,
        /// or <c>null</c> if no failing response has been observed yet.
        /// </summary>
        public IRestResponse LastErrorResponse { get; set; }

        // Status codes that trigger a retry regardless of the response body.
        private static readonly List<HttpStatusCode> invalidStatusCodes = new List<HttpStatusCode> {
            HttpStatusCode.BadGateway,
            HttpStatusCode.Unauthorized,
            HttpStatusCode.InternalServerError,
            HttpStatusCode.RequestTimeout,
            HttpStatusCode.BadRequest,
            HttpStatusCode.Forbidden,
            HttpStatusCode.GatewayTimeout
        };


        /// <summary>
        /// Creates an executor bound to one client/request pair.
        /// </summary>
        /// <param name="client">Client used to build the URI and send the request.</param>
        /// <param name="request">Request that is re-sent on every attempt.</param>
        public RequestExecutor(IRestClient client, IRestRequest request)
        {
            this.client = client;
            this.request = request;
        }

        /// <summary>
        /// Sends the request, retrying while the response is considered a failure.
        /// </summary>
        /// <param name="retryCount">Maximum number of retries after the first attempt.</param>
        /// <param name="delay">Fixed wait between attempts, in milliseconds.</param>
        /// <returns>
        /// The first acceptable response; otherwise the response of the final failed
        /// attempt. If the policy captured an exception instead of a response, the
        /// current <see cref="LastErrorResponse"/> is returned (it may be <c>null</c>).
        /// </returns>
        public IRestResponse Execute(int retryCount, int delay)
        {
            retryPolicy = Policy
                .HandleResult<IRestResponse>(resp => invalidStatusCodes.Contains(resp.StatusCode) || !IsValidJson(resp))
                .WaitAndRetry(retryCount, i => TimeSpan.FromMilliseconds(delay), (result, timeSpan, currentRetryCount, context) =>
                {
                    // NOTE(review): a StatusCode of 0 here presumably means no HTTP response was
                    // received at all (transport error or timeout) - confirm by inspecting the
                    // response's ResponseStatus / ErrorException.
                    logger.Error($"Request failed with {result.Result.StatusCode}. Waiting {timeSpan} before next retry. Retry attempt {currentRetryCount}");
                    LastErrorResponse = result.Result;
                });

            var policyResponse = retryPolicy.ExecuteAndCapture(() =>
            {
                var url = client.BuildUri(request);
                logger.Debug(url.ToString());
                return client.Execute(request);
            });

            if (policyResponse.Result != null)
            {
                return policyResponse.Result;
            }

            // The retry callback only runs *before* each retry, so it never sees the final
            // attempt (and never runs at all when retryCount is 0). The response of the last
            // failed attempt is exposed by the policy as FinalHandledResult.
            if (policyResponse.FinalHandledResult != null)
            {
                LastErrorResponse = policyResponse.FinalHandledResult;
            }

            return LastErrorResponse;
        }

        /// <summary>
        /// Checks whether the response body parses as a JSON object.
        /// </summary>
        /// <param name="response">Response whose <c>Content</c> is inspected.</param>
        /// <returns>
        /// <c>false</c> for a null response, null or empty content, or content that
        /// <c>JObject.Parse</c> rejects; <c>true</c> otherwise.
        /// </returns>
        public static bool IsValidJson(IRestResponse response)
        {
            // A missing or empty body is treated as invalid.
            if (response == null || string.IsNullOrEmpty(response.Content))
            {
                return false;
            }

            try
            {
                JObject.Parse(response.Content);
            }
            catch (JsonReaderException)
            {
                // Malformed JSON.
                return false;
            }

            return true;
        }
    }
}


using Microsoft.VisualStudio.TestTools.UnitTesting;
using FundsAFE.Graphite;
using Moq;
using RestSharp;
using System.Net;
using FluentAssertions;
using System;
using FluentAssertions.Extensions;

namespace FundsAFE.Test.Moq
{
    /// <summary>
    /// Tests that <see cref="RequestExecutor"/> retries on bad status codes and on
    /// bodies that are not valid JSON, using a mocked <see cref="IRestClient"/>.
    /// </summary>
    [TestClass]
    public class MoqUnitTestRequest
    {

        /// <summary>
        /// Builds a mock client whose <c>Execute</c> always returns a response with the
        /// given status code and content, and whose <c>BuildUri</c> returns a fixed URI.
        /// </summary>
        /// <param name="code">Status code of every returned response.</param>
        /// <param name="content">Body of every returned response.</param>
        /// <returns>The configured mock.</returns>
        public Mock<IRestClient> CreateMockClientWithStatusCodeAndContent(HttpStatusCode code, string content)
        {
            Mock<IRestClient> mockClient = new Mock<IRestClient>();
            mockClient.Setup(c => c.Execute(It.IsAny<IRestRequest>())).Returns(
                new RestResponse
                {
                    Content = content,
                    StatusCode = code
                }
            );

            mockClient.Setup(c => c.BuildUri(It.IsAny<IRestRequest>())).Returns(
                new Uri("http://fake.fake")
            );

            return mockClient;
        }

        /// <summary>
        /// Every retryable status code must exhaust all retries, wait at least the
        /// configured delay between attempts, and surface the failing response.
        /// </summary>
        [DataTestMethod]
        [DataRow(HttpStatusCode.BadGateway)]
        [DataRow(HttpStatusCode.Unauthorized)]
        [DataRow(HttpStatusCode.InternalServerError)]
        [DataRow(HttpStatusCode.RequestTimeout)]
        [DataRow(HttpStatusCode.BadRequest)]
        [DataRow(HttpStatusCode.Forbidden)]
        [DataRow(HttpStatusCode.GatewayTimeout)]
        public void TestBadStatusCodesAndRetry(HttpStatusCode httpStatusCode) {
            //Arrange
            Mock<IRestRequest> mockRequest = new Mock<IRestRequest>();
            Mock<IRestClient> mockClient = CreateMockClientWithStatusCodeAndContent(httpStatusCode, "fakecontent");
            RequestExecutor requestExecutor = new RequestExecutor(mockClient.Object, mockRequest.Object);

            int retries = 10;
            int delay = 50;
            int totalWaitTime = (retries * delay) - 10; //10ms error margin

            //Act and Verify
            // Use the same delay as the timing assertion below (was a hard-coded 101).
            var response = requestExecutor.Execute(retryCount: retries, delay: delay);
            mockClient.Verify(x => x.Execute(It.IsAny<IRestRequest>()), Times.Exactly(retries + 1)); //1st failed attempt + 10 retries = 11            

            //Assert
            // ExecutionTimeOf runs Execute a second time; the call count was verified above.
            requestExecutor.ExecutionTimeOf(re => re.Execute(retries, delay)).Should().BeGreaterOrEqualTo(totalWaitTime.Milliseconds());
            response.Should().NotBeNull();
            response.StatusCode.Should().Be(httpStatusCode);
            requestExecutor.LastErrorResponse.StatusCode.Should().Be(httpStatusCode);
        }

        /// <summary>
        /// A 200 OK response with an empty or malformed JSON body must exhaust all
        /// retries and wait at least the configured delay between attempts.
        /// </summary>
        [DataTestMethod]
        //Empty content
        [DataRow("")]
        //Missing closing quote
        [DataRow("{\"fruit\": \"Apple,\"size\": \"Large\",\"color\": \"Red\"}")]
        //Missing opening square bracket
        [DataRow("\"q1\": {\"question\": \"Which one is correct team name in NBA?\",\"options\": \"New York Bulls\",\"Los Angeles Kings\",\"Golden State Warriros\",\"Huston Rocket\"],\"answer\": \"Huston Rocket\"}")]
        //Missing curly bracket
        [DataRow("\"sport\": {\"q1\": {\"question\": \"Which one is correct team name in NBA?\",\"options\": \"New York Bulls\",\"Los Angeles Kings\",\"Golden State Warriros\",\"Huston Rocket\"],\"answer\": \"Huston Rocket\"}")]
        public void TestBadContentRetries(string content)
        {

            //Arrange
            Mock<IRestRequest> mockRequest = new Mock<IRestRequest>();
            Mock<IRestClient> mockClient = CreateMockClientWithStatusCodeAndContent(HttpStatusCode.OK, content);
            RequestExecutor requestExecutor = new RequestExecutor(mockClient.Object, mockRequest.Object);

            int retries = 10;
            int delay = 50;
            int totalWaitTime = (retries * delay) - 10; //10ms error margin

            //Act and Verify            
            var response = requestExecutor.Execute(retryCount: retries, delay: delay);
            mockClient.Verify(x => x.Execute(It.IsAny<IRestRequest>()), Times.Exactly(retries + 1)); //1st failed attempt + 10 retries = 11            

            //Assert
            // ExecutionTimeOf runs Execute a second time; the call count was verified above.
            requestExecutor.ExecutionTimeOf(re => re.Execute(retries, delay)).Should().BeGreaterOrEqualTo(totalWaitTime.Milliseconds());
            response.Should().NotBeNull();

        }
    }
}

此数据集是一个 pandas DataFrame。我的目标是计算两个国家之间双向迁移人数的比率。例如,从“foo”到“bar”的迁移人数与从“bar”到“foo”的迁移人数之比,在本例中为 123/222 = 0.55。

此外,如果可能的话,例如,以以下方式将它们分组到单个数据集或多个子集中:

Country A | Country B | Migrants from A to B
   foo         bar           123
   foo         qux           221
   bar         qux           133
   qux         foo           312
   bar         foo           222    

如何使用 pandas、numpy 等来完成?

曾经试图像这样对它们进行分组(尽管我什至无法开始合理化):

Country A | Country B |  A to B ratio
   foo         bar           0.55
   bar         foo           1.80
   foo         qux           0.71
   qux         foo           1.41
   ..           ..            ..  

由于我对 pandas 的操作不够熟悉,所以想不出可行的办法。即使是不够优雅的解决方法,任何建议也都会有所帮助。谢谢!

4 个答案:

答案 0 :(得分:3)

您可以将表与自身进行 merge(相当于 SQL 中的自连接):

# Give the columns identifier-friendly names.
df.columns = ['Country_A', 'Country_B', 'A_to_B']

# Self-join: pair every (A, B) row with its reverse (B, A) row.
# how='left' keeps every row of df in its original order; a pair with no reverse
# direction gets NaN in A_to_B_y instead of being dropped.
# Assumes each (Country_A, Country_B) pair occurs at most once; otherwise the
# merge yields extra rows and the positional assignment below fails.
df1 = pd.merge(df, df, how='left', left_on=['Country_A', 'Country_B'], right_on=['Country_B', 'Country_A'])
# Assign by position: df1 carries a fresh 0..n-1 index, so label-based alignment
# against df's index could attach ratios to the wrong rows.
df['ratio'] = (df1['A_to_B_x'] / df1['A_to_B_y']).to_numpy()

答案 1 :(得分:2)

使用 DataFrame.pivot_table 和 DataFrame.lookup 进行映射:

# mapper.loc[a, b] holds the migrants from a to b (NaN where that direction is absent).
mapper = df.pivot_table(index='CountryA', columns='CountryB', values='MigrantsfromAtoB')
# DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0, so the reverse
# flow is fetched with unstack() + reindex() instead. unstack() flattens the pivot
# into a Series keyed by (column label, row label); the key (CountryA, CountryB) of
# a row therefore selects mapper.loc[CountryB, CountryA], i.e. the flow from B to A.
# Pairs that are not present in the pivot come back as NaN.
reverse_flow = mapper.unstack().reindex(pd.MultiIndex.from_arrays([df['CountryA'], df['CountryB']]))
df['ratio'] = df['MigrantsfromAtoB'] / reverse_flow.to_numpy()
print(df)
  CountryA CountryB  MigrantsfromAtoB     ratio
0      foo      bar               123  0.554054
1      foo      qux               221  0.708333
2      bar      qux               133       NaN
3      qux      foo               312  1.411765
4      bar      foo               222  1.804878

答案 2 :(得分:1)

您可以循环执行此操作,但@ansev的回答更合乎逻辑:

# Sample data: one row per directed flow A -> B.
df = pd.DataFrame({
    "A": ["foo", "foo", "bar", "qux", "bar", "qux"],
    "B": ["bar", "qux", "qux", "foo", "foo", "bar"],
    "AtoB": [123, 221, 133, 312, 222, 444],
})

for i in range(len(df)):
    # Find the reverse flow (B -> A) by comparing both columns. Matching on a
    # concatenated "A + B" string can collide ("ab" + "c" == "a" + "bc").
    reverse = df.loc[(df["A"] == df.loc[i, "B"]) & (df["B"] == df.loc[i, "A"]), "AtoB"]
    if reverse.empty:
        # No flow in the opposite direction: the ratio is undefined.
        df.loc[i, "AtoB_Ratio"] = float("nan")
    else:
        # .item() still raises if the reverse pair is duplicated, as before.
        df.loc[i, "AtoB_Ratio"] = df.loc[i, "AtoB"] / reverse.item()

输出:

A   B   AtoB    AtoB_Ratio
0   foo bar 123 0.554054
1   foo qux 221 0.708333
2   bar qux 133 0.299550
3   qux foo 312 1.411765
4   bar foo 222 1.804878
5   qux bar 444 3.338346

答案 3 :(得分:1)

df.columns = ['A', 'B', 'AtoB']

# Pivot to a country-by-country matrix: df1.loc[a, b] is the flow from a to b.
df1 = df.pivot_table('AtoB', index=['A'], columns=['B'])
# The element-wise division below is positional, so the matrix must be square with
# the same country order on both axes. Reindex both axes to the union of countries;
# a country that only appears as origin or only as destination gets NaN entries.
countries = df1.index.union(df1.columns)
df1 = df1.reindex(index=countries.rename('A'), columns=countries.rename('B'))
# ratio[a, b] = flow(a -> b) / flow(b -> a)
df2 = pd.DataFrame(df1.to_numpy() / df1.to_numpy().T, columns=df1.columns, index=df1.index)
df2

Out[1]:
  B      bar       foo          qux
  A         
bar     NaN        1.804878     NaN
foo     0.554054   NaN          0.708333
qux     NaN        1.411765     NaN


# Back to long format: one row per (A, B) pair with its ratio.
df2 = df2.reset_index().melt(id_vars=['A'], value_vars=df1.columns, value_name='ratio')
# Show every pair except a country paired with itself.
df2[df2['A'] != df2['B']]

Out[2]:
    A   B   ratio
1   foo bar 0.554054
2   qux bar NaN
3   bar foo 1.804878
5   qux foo 1.411765
6   bar qux NaN
7   foo qux 0.708333