diff --git "a/llmtf_eval/darumeru_ruOpenBookQA.jsonl" "b/llmtf_eval/darumeru_ruOpenBookQA.jsonl" --- "a/llmtf_eval/darumeru_ruOpenBookQA.jsonl" +++ "b/llmtf_eval/darumeru_ruOpenBookQA.jsonl" @@ -7,10 +7,10 @@ ] }, "predict": { - "A": 0.019716909155249596, - "B": 0.013551220297813416, - "C": 0.9500139355659485, - "D": 0.004399437922984362 + "A": 0.003998769447207451, + "B": 0.0024253760930150747, + "C": 0.9784665703773499, + "D": 0.001145666465163231 }, "sample": { "messages": [ @@ -45,17 +45,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.42418548464775085, - "B": 0.5446649193763733, - "C": 0.006050681229680777, - "D": 0.007769227959215641 + "A": 0.7286432385444641, + "B": 0.236555814743042, + "C": 0.005563259590417147, + "D": 0.002178603783249855 }, "sample": { "messages": [ @@ -85,7 +85,7 @@ "prompt_len": 56, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -97,10 +97,10 @@ ] }, "predict": { - "A": 0.9326282739639282, - "B": 0.01708168350160122, - "C": 0.0006623287335969508, - "D": 0.0026195610407739878 + "A": 0.8784412741661072, + "B": 0.001165498630143702, + "C": 0.00016790599329397082, + "D": 0.0005505425506271422 }, "sample": { "messages": [ @@ -142,10 +142,10 @@ ] }, "predict": { - "A": 0.07336515188217163, - "B": 0.37257900834083557, - "C": 0.002844675676897168, - "D": 0.5420992374420166 + "A": 0.016887381672859192, + "B": 0.009039164520800114, + "C": 0.024570994079113007, + "D": 0.9220197796821594 }, "sample": { "messages": [ @@ -187,10 +187,10 @@ ] }, "predict": { - "A": 0.004000919405370951, - "B": 0.004533635452389717, - "C": 0.9789926409721375, - "D": 0.0018899004207924008 + "A": 0.0014867965364828706, + "B": 0.0005469618481583893, + "C": 0.9889302849769592, + "D": 0.0002927676250692457 }, "sample": { "messages": [ @@ -232,10 +232,10 @@ ] }, "predict": { - "A": 0.07970964163541794, - "B": 0.042665496468544006, - "C": 0.8569594621658325, - "D": 0.0018745912238955498 + "A": 0.10307756066322327, + "B": 0.0027469240594655275, + "C": 0.8630578517913818, + "D": 0.00025550374994054437 }, "sample": { "messages": [ @@ -277,10 +277,10 @@ ] }, "predict": { - "A": 0.003545519895851612, - "B": 0.9830743670463562, - "C": 0.00039779627695679665, - "D": 0.001897779991850257 + "A": 0.005693153478205204, + "B": 0.9574412107467651, + "C": 0.0005636985297314823, + "D": 0.0008730734116397798 }, "sample": { "messages": [ @@ -322,10 +322,10 @@ ] }, "predict": { - "A": 0.0011607675114646554, - "B": 0.0007040411001071334, - "C": 0.9913636445999146, - "D": 0.000332565454300493 + "A": 0.0004825304204132408, + "B": 6.1346850998234e-05, + "C": 0.9885988235473633, + "D": 3.720874519785866e-05 }, "sample": { "messages": [ @@ -367,10 +367,10 @@ ] }, "predict": { - "A": 0.0002936260134447366, - "B": 0.0009044317412190139, - "C": 0.9918298125267029, - "D": 8.41252549435012e-05 + "A": 0.0005486257723532617, + "B": 0.0001014855588437058, + "C": 0.9919386506080627, + "D": 4.503394666244276e-05 }, "sample": { "messages": [ @@ -412,10 +412,10 @@ ] }, "predict": { - "A": 0.0029724480118602514, - "B": 0.046496905386447906, - "C": 0.000751552521251142, - "D": 0.933915376663208 + "A": 0.0003893633547704667, + "B": 0.010689455084502697, + "C": 0.0003436119295656681, + "D": 0.9622340202331543 }, "sample": { "messages": [ @@ -457,10 +457,10 @@ ] }, "predict": { - "A": 0.0020864021498709917, - "B": 0.1462680548429489, - "C": 0.0005977641558274627, - "D": 0.84171462059021 + "A": 0.00030677972245030105, + "B": 0.01225427258759737, + "C": 9.959678573068231e-05, + "D": 0.9734774231910706 }, "sample": { "messages": [ @@ -502,10 +502,10 @@ ] }, "predict": { - "A": 0.0007950864965096116, - "B": 0.0011568439658731222, - "C": 0.00020102935377508402, - "D": 0.9880127906799316 + "A": 0.00018292297318112105, + "B": 0.00017184023454319686, + "C": 7.163367990870029e-05, + "D": 0.9570062756538391 }, "sample": { "messages": [ @@ -547,10 +547,10 @@ ] }, "predict": { - "A": 0.017562948167324066, - "B": 0.00732132513076067, - "C": 0.0016336087137460709, - "D": 0.9589043855667114 + "A": 0.009316450916230679, + "B": 0.0011844538385048509, + "C": 0.000718407507520169, + "D": 0.9503037929534912 }, "sample": { "messages": [ @@ -592,10 +592,10 @@ ] }, "predict": { - "A": 0.006331128068268299, - "B": 0.022097809240221977, - "C": 0.9396227598190308, - "D": 0.019501248374581337 + "A": 0.011086543090641499, + "B": 0.04968643933534622, + "C": 0.880713164806366, + "D": 0.023470209911465645 }, "sample": { "messages": [ @@ -637,10 +637,10 @@ ] }, "predict": { - "A": 0.35054653882980347, - "B": 0.04744131490588188, - "C": 0.577953577041626, - "D": 0.0014326036907732487 + "A": 0.23214684426784515, + "B": 0.0070102280005812645, + "C": 0.7150626182556152, + "D": 0.00022534340678248554 }, "sample": { "messages": [ @@ -682,10 +682,10 @@ ] }, "predict": { - "A": 0.9611201882362366, - "B": 0.0030590349342674017, - "C": 0.0006023596506565809, - "D": 0.0002845345879904926 + "A": 0.8801038861274719, + "B": 0.00021600365289486945, + "C": 0.00021600365289486945, + "D": 4.527682176558301e-05 }, "sample": { "messages": [ @@ -727,10 +727,10 @@ ] }, "predict": { - "A": 0.010701581835746765, - "B": 0.01557070855051279, - "C": 0.9633256793022156, - "D": 0.0009954021079465747 + "A": 0.0021578343585133553, + "B": 0.0011550054186955094, + "C": 0.9864425659179688, + "D": 0.00010743224993348122 }, "sample": { "messages": [ @@ -772,10 +772,10 @@ ] }, "predict": { - "A": 0.7974213361740112, - "B": 0.15702156722545624, - "C": 0.000641711289063096, - "D": 0.0008239736198447645 + "A": 0.8836753368377686, + "B": 0.030237706378102303, + "C": 0.00011608759814407676, + "D": 0.00021688018750865012 }, "sample": { "messages": [ @@ -817,10 +817,10 @@ ] }, "predict": { - "A": 0.23721423745155334, - "B": 0.34514468908309937, - "C": 0.3911001682281494, - "D": 0.003834211267530918 + "A": 0.34952083230018616, + "B": 0.08837269246578217, + "C": 0.5085498094558716, + "D": 0.0030239473562687635 }, "sample": { "messages": [ @@ -862,10 +862,10 @@ ] }, "predict": { - "A": 0.0021495562978088856, - "B": 0.982658326625824, - "C": 0.0006158581236377358, - "D": 0.0005434929626062512 + "A": 0.0007990816375240684, + "B": 0.9328159093856812, + "C": 0.0004277175758033991, + "D": 0.000515925872605294 }, "sample": { "messages": [ @@ -907,10 +907,10 @@ ] }, "predict": { - "A": 0.9491506814956665, - "B": 0.0072468542493879795, - "C": 0.0012593145947903395, - "D": 0.0030209387186914682 + "A": 0.8780667185783386, + "B": 0.0011650017695501447, + "C": 0.0003337786183692515, + "D": 0.0007066092803142965 }, "sample": { "messages": [ @@ -952,10 +952,10 @@ ] }, "predict": { - "A": 0.9475151896476746, - "B": 0.005634131375700235, - "C": 0.0016142057720571756, - "D": 0.0018291346495971084 + "A": 0.890738844871521, + "B": 0.00043476541759446263, + "C": 0.0008122487342916429, + "D": 0.0008122487342916429 }, "sample": { "messages": [ @@ -997,10 +997,10 @@ ] }, "predict": { - "A": 7.430034747812897e-05, - "B": 0.0001081063601304777, - "C": 0.9926322102546692, - "D": 0.003579990938305855 + "A": 2.899223727581557e-05, + "B": 5.0882965297205374e-05, + "C": 0.9890776872634888, + "D": 0.002451678505167365 }, "sample": { "messages": [ @@ -1042,10 +1042,10 @@ ] }, "predict": { - "A": 0.0040041194297373295, - "B": 0.9797756671905518, - "C": 5.36552288394887e-05, - "D": 7.806788926245645e-05 + "A": 0.002591285388916731, + "B": 0.9225614666938782, + "C": 3.934658525395207e-05, + "D": 5.3780411690240726e-05 }, "sample": { "messages": [ @@ -1087,10 +1087,10 @@ ] }, "predict": { - "A": 0.002750266809016466, - "B": 0.004001614637672901, - "C": 0.0008387839770875871, - "D": 0.9791628122329712 + "A": 0.0010749254142865539, + "B": 0.0006124739884398878, + "C": 0.00012838153634220362, + "D": 0.9772586226463318 }, "sample": { "messages": [ @@ -1132,10 +1132,10 @@ ] }, "predict": { - "A": 0.012013775296509266, - "B": 0.9543724060058594, - "C": 0.005008086562156677, - "D": 0.003037558402866125 + "A": 0.008807304315268993, + "B": 0.8983694314956665, + "C": 0.0014377528568729758, + "D": 0.0018461111467331648 }, "sample": { "messages": [ @@ -1177,10 +1177,10 @@ ] }, "predict": { - "A": 0.003975218627601862, - "B": 0.9727038741111755, - "C": 0.0011389191495254636, - "D": 0.0006907893694005907 + "A": 0.003271061461418867, + "B": 0.9069746732711792, + "C": 0.0010619581444188952, + "D": 0.0005339860799722373 }, "sample": { "messages": [ @@ -1222,10 +1222,10 @@ ] }, "predict": { - "A": 0.0024585870560258627, - "B": 0.0010248915059491992, - "C": 0.9918647408485413, - "D": 0.00025913314311765134 + "A": 0.0013024430954828858, + "B": 0.0001762664905982092, + "C": 0.9816568493843079, + "D": 8.326239913003519e-05 }, "sample": { "messages": [ @@ -1267,10 +1267,10 @@ ] }, "predict": { - "A": 0.9016233682632446, - "B": 0.030851854011416435, - "C": 0.0025324744638055563, - "D": 0.0013555358164012432 + "A": 0.8882097601890564, + "B": 0.003203384578227997, + "C": 0.0005229381495155394, + "D": 0.0003171779972035438 }, "sample": { "messages": [ @@ -1312,10 +1312,10 @@ ] }, "predict": { - "A": 0.09269159287214279, - "B": 0.8794333934783936, - "C": 0.0059255752712488174, - "D": 0.0067145563662052155 + "A": 0.03580572083592415, + "B": 0.9234417080879211, + "C": 0.0024366099387407303, + "D": 0.009053104557096958 }, "sample": { "messages": [ @@ -1357,10 +1357,10 @@ ] }, "predict": { - "A": 0.8273126482963562, - "B": 0.06790995597839355, - "C": 0.024982677772641182, - "D": 0.0363495834171772 + "A": 0.8639833331108093, + "B": 0.01793140172958374, + "C": 0.009597988799214363, + "D": 0.01396499015390873 }, "sample": { "messages": [ @@ -1395,17 +1395,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.3977893888950348, - "B": 0.5787801146507263, - "C": 0.0008701607002876699, - "D": 0.0002493052161298692 + "A": 0.8006658554077148, + "B": 0.10835833847522736, + "C": 0.0007301127188839018, + "D": 0.0001350572711089626 }, "sample": { "messages": [ @@ -1435,7 +1435,7 @@ "prompt_len": 82, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -1447,10 +1447,10 @@ ] }, "predict": { - "A": 0.004375998396426439, - "B": 0.00561889261007309, - "C": 0.025182131677865982, - "D": 0.9449524879455566 + "A": 0.00015363724378403276, + "B": 0.00017409380234312266, + "C": 0.0006076470599509776, + "D": 0.9695567488670349 }, "sample": { "messages": [ @@ -1492,10 +1492,10 @@ ] }, "predict": { - "A": 0.008475230075418949, - "B": 0.0035330012906342745, - "C": 0.9796033501625061, - "D": 0.001146997557953 + "A": 0.005844385828822851, + "B": 0.0006557219894602895, + "C": 0.9828746318817139, + "D": 0.0005436125793494284 }, "sample": { "messages": [ @@ -1537,10 +1537,10 @@ ] }, "predict": { - "A": 0.20769399404525757, - "B": 0.7249232530593872, - "C": 0.0318509042263031, - "D": 0.0055348570458590984 + "A": 0.02297513745725155, + "B": 0.8621357083320618, + "C": 0.03787960112094879, + "D": 0.004524073097854853 }, "sample": { "messages": [ @@ -1582,10 +1582,10 @@ ] }, "predict": { - "A": 0.9376970529556274, - "B": 0.01946128159761429, - "C": 0.001409770455211401, - "D": 0.0023243187461048365 + "A": 0.8471231460571289, + "B": 0.0030552030075341463, + "C": 0.0007724763127043843, + "D": 0.0011239463929086924 }, "sample": { "messages": [ @@ -1623,14 +1623,14 @@ "acc": false, "f1_macro": [ "C", - "D" + "A" ] }, "predict": { - "A": 0.015614913776516914, - "B": 0.0023946247529238462, - "C": 0.0023946247529238462, - "D": 0.9660605192184448 + "A": 0.5907383561134338, + "B": 0.0029120962135493755, + "C": 0.020213954150676727, + "D": 0.3161994516849518 }, "sample": { "messages": [ @@ -1660,22 +1660,22 @@ "prompt_len": 96, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "D" ] }, "predict": { - "A": 0.04574642330408096, - "B": 0.6315099000930786, - "C": 0.0029244711622595787, - "D": 0.29830417037010193 + "A": 0.05643561854958534, + "B": 0.22320719063282013, + "C": 0.003183879889547825, + "D": 0.6875265836715698 }, "sample": { "messages": [ @@ -1705,7 +1705,7 @@ "prompt_len": 88, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -1717,10 +1717,10 @@ ] }, "predict": { - "A": 0.00020030708401463926, - "B": 0.9844629764556885, - "C": 6.503018084913492e-05, - "D": 0.00037422290188260376 + "A": 0.00014619788271375, + "B": 0.922609269618988, + "C": 7.825408829376101e-05, + "D": 0.0005431910394690931 }, "sample": { "messages": [ @@ -1755,17 +1755,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "A" + "C" ] }, "predict": { - "A": 0.49102476239204407, - "B": 0.04567241668701172, - "C": 0.4333278238773346, - "D": 0.004248196724802256 + "A": 0.3728393316268921, + "B": 0.018562575802206993, + "C": 0.5424780249595642, + "D": 0.0028466633521020412 }, "sample": { "messages": [ @@ -1795,7 +1795,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -1807,10 +1807,10 @@ ] }, "predict": { - "A": 0.002153996843844652, - "B": 0.9846882224082947, - "C": 0.0006993003189563751, - "D": 0.0019008953822776675 + "A": 0.0008588286582380533, + "B": 0.9418200254440308, + "C": 0.00045969788334332407, + "D": 0.0018181403866037726 }, "sample": { "messages": [ @@ -1845,17 +1845,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.003554937895387411, - "B": 0.6774512529373169, - "C": 0.28240370750427246, - "D": 0.0021561789326369762 + "A": 0.0017114710062742233, + "B": 0.1540617048740387, + "C": 0.7823899388313293, + "D": 0.0008084416622295976 }, "sample": { "messages": [ @@ -1885,7 +1885,7 @@ "prompt_len": 92, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -1897,10 +1897,10 @@ ] }, "predict": { - "A": 0.9329296946525574, - "B": 0.013307526707649231, - "C": 0.0005159887950867414, - "D": 0.01708720438182354 + "A": 0.8905201554298401, + "B": 0.0006324246060103178, + "C": 8.558935951441526e-05, + "D": 0.0017191083170473576 }, "sample": { "messages": [ @@ -1942,10 +1942,10 @@ ] }, "predict": { - "A": 0.0002543233858887106, - "B": 0.9734548330307007, - "C": 0.00037003838224336505, - "D": 0.001658397144638002 + "A": 0.00025003388873301446, + "B": 0.8990522623062134, + "C": 0.0002833255275618285, + "D": 0.0017355792224407196 }, "sample": { "messages": [ @@ -1987,10 +1987,10 @@ ] }, "predict": { - "A": 0.0016645704163238406, - "B": 0.9770784974098206, - "C": 0.0006123611819930375, - "D": 0.0006938961450941861 + "A": 0.0022165998816490173, + "B": 0.9519137740135193, + "C": 0.0004945903201587498, + "D": 0.0009836102835834026 }, "sample": { "messages": [ @@ -2025,17 +2025,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.396738737821579, - "B": 0.03256629779934883, - "C": 0.03690245375037193, - "D": 0.5094226598739624 + "A": 0.6122057437896729, + "B": 0.0030179214663803577, + "C": 0.018487010151147842, + "D": 0.2891855239868164 }, "sample": { "messages": [ @@ -2065,7 +2065,7 @@ "prompt_len": 59, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -2077,10 +2077,10 @@ ] }, "predict": { - "A": 0.022163333371281624, - "B": 0.5715999603271484, - "C": 0.001250369823537767, - "D": 0.3928545117378235 + "A": 0.029049944132566452, + "B": 0.5834837555885315, + "C": 0.0006831891951151192, + "D": 0.3539007902145386 }, "sample": { "messages": [ @@ -2122,10 +2122,10 @@ ] }, "predict": { - "A": 0.10249520093202591, - "B": 0.8581818342208862, - "C": 0.004503325093537569, - "D": 0.003095086896792054 + "A": 0.12016844749450684, + "B": 0.7835966944694519, + "C": 0.005279832985252142, + "D": 0.001714110840111971 }, "sample": { "messages": [ @@ -2167,10 +2167,10 @@ ] }, "predict": { - "A": 0.01069941557943821, - "B": 0.002705235965549946, - "C": 0.003473591525107622, - "D": 0.9631306529045105 + "A": 0.0014738230966031551, + "B": 0.00035006366670131683, + "C": 0.0005093395593576133, + "D": 0.9209076166152954 }, "sample": { "messages": [ @@ -2212,10 +2212,10 @@ ] }, "predict": { - "A": 0.8085614442825317, - "B": 0.010178286582231522, - "C": 0.14050689339637756, - "D": 0.0017687209183350205 + "A": 0.7401411533355713, + "B": 0.0007647860911674798, + "C": 0.12861724197864532, + "D": 0.0005595294642262161 }, "sample": { "messages": [ @@ -2257,10 +2257,10 @@ ] }, "predict": { - "A": 0.032935429364442825, - "B": 0.06972429901361465, - "C": 0.017629064619541168, - "D": 0.8494158983230591 + "A": 0.020282583311200142, + "B": 0.062474753707647324, + "C": 0.010856484062969685, + "D": 0.8624373078346252 }, "sample": { "messages": [ @@ -2302,10 +2302,10 @@ ] }, "predict": { - "A": 0.9549368023872375, - "B": 0.000986733939498663, - "C": 0.0007684691809117794, - "D": 6.307978765107691e-05 + "A": 0.8168748021125793, + "B": 0.00010731207294156775, + "C": 0.000310517760226503, + "D": 1.4523110621667001e-05 }, "sample": { "messages": [ @@ -2347,10 +2347,10 @@ ] }, "predict": { - "A": 0.002765294862911105, - "B": 0.00048053619684651494, - "C": 0.000176779372850433, - "D": 0.9845131039619446 + "A": 0.0008869935991242528, + "B": 7.750463555566967e-05, + "C": 8.250325481640175e-05, + "D": 0.9727066159248352 }, "sample": { "messages": [ @@ -2392,10 +2392,10 @@ ] }, "predict": { - "A": 0.02895169146358967, - "B": 0.002097253454849124, - "C": 0.9587483406066895, - "D": 0.0011225788621231914 + "A": 0.013842277228832245, + "B": 0.00015377381350845098, + "C": 0.9704186320304871, + "D": 0.0001974494953174144 }, "sample": { "messages": [ @@ -2437,10 +2437,10 @@ ] }, "predict": { - "A": 0.0011476267827674747, - "B": 0.980140745639801, - "C": 0.00013706448953598738, - "D": 0.00847987923771143 + "A": 0.0023329739924520254, + "B": 0.9411888122558594, + "C": 0.00013161738752387464, + "D": 0.007186064962297678 }, "sample": { "messages": [ @@ -2482,10 +2482,10 @@ ] }, "predict": { - "A": 0.019351666793227196, - "B": 0.008066975511610508, - "C": 0.013300193473696709, - "D": 0.9324155449867249 + "A": 0.0033851785119622946, + "B": 0.0007095719920471311, + "C": 0.002987409709021449, + "D": 0.938616156578064 }, "sample": { "messages": [ @@ -2527,10 +2527,10 @@ ] }, "predict": { - "A": 0.007420563139021397, - "B": 0.9719020128250122, - "C": 0.0005375438486225903, - "D": 0.00028772649238817394 + "A": 0.009204134345054626, + "B": 0.9388471841812134, + "C": 0.0007555213524028659, + "D": 0.00026110117323696613 }, "sample": { "messages": [ @@ -2572,10 +2572,10 @@ ] }, "predict": { - "A": 0.8692196607589722, - "B": 0.07134989649057388, - "C": 0.02974306233227253, - "D": 0.0031348958145827055 + "A": 0.9049721956253052, + "B": 0.012908734381198883, + "C": 0.021282905712723732, + "D": 0.0014483199920505285 }, "sample": { "messages": [ @@ -2610,17 +2610,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "A" + "C" ] }, "predict": { - "A": 0.6435719132423401, - "B": 0.007149438839405775, - "C": 0.3040018677711487, - "D": 0.0001155599020421505 + "A": 0.24799859523773193, + "B": 0.001148461364209652, + "C": 0.6741299629211426, + "D": 5.3714240493718535e-05 }, "sample": { "messages": [ @@ -2650,22 +2650,22 @@ "prompt_len": 88, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "C" ] }, "predict": { - "A": 0.20097938179969788, - "B": 0.030821185559034348, - "C": 0.2580626308917999, - "D": 0.4821244776248932 + "A": 0.20616838335990906, + "B": 0.007509682327508926, + "C": 0.6350433230400085, + "D": 0.1103539764881134 }, "sample": { "messages": [ @@ -2695,7 +2695,7 @@ "prompt_len": 87, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -2707,10 +2707,10 @@ ] }, "predict": { - "A": 0.00870422925800085, - "B": 0.000630532274954021, - "C": 0.3701132833957672, - "D": 0.6102136373519897 + "A": 0.008184937760233879, + "B": 0.00016987296112347394, + "C": 0.21109232306480408, + "D": 0.7367845773696899 }, "sample": { "messages": [ @@ -2752,10 +2752,10 @@ ] }, "predict": { - "A": 0.004551015794277191, - "B": 0.0018971455283463001, - "C": 0.9827456474304199, - "D": 9.445330942980945e-05 + "A": 0.0024385720025748014, + "B": 0.0003100296307820827, + "C": 0.9837900996208191, + "D": 3.702775575220585e-05 }, "sample": { "messages": [ @@ -2797,10 +2797,10 @@ ] }, "predict": { - "A": 0.9511327743530273, - "B": 0.0007654079818166792, - "C": 8.067340240813792e-05, - "D": 0.00010358670988352969 + "A": 0.8929701447486877, + "B": 0.00014150122296996415, + "C": 4.054078453918919e-05, + "D": 2.617509562696796e-05 }, "sample": { "messages": [ @@ -2842,10 +2842,10 @@ ] }, "predict": { - "A": 0.9144524335861206, - "B": 0.01897885464131832, - "C": 0.01897885464131832, - "D": 0.002910501789301634 + "A": 0.8053324222564697, + "B": 0.008946435526013374, + "C": 0.05833808705210686, + "D": 0.001761657535098493 }, "sample": { "messages": [ @@ -2887,10 +2887,10 @@ ] }, "predict": { - "A": 0.9394080638885498, - "B": 0.02209276147186756, - "C": 0.0023285599891096354, - "D": 0.004350322764366865 + "A": 0.9522002339363098, + "B": 0.0034341702703386545, + "C": 0.0006762281991541386, + "D": 0.0011149118654429913 }, "sample": { "messages": [ @@ -2932,10 +2932,10 @@ ] }, "predict": { - "A": 0.001022498938255012, - "B": 0.0006601749337278306, - "C": 0.9895492792129517, - "D": 0.00037615635665133595 + "A": 0.00037724358844570816, + "B": 0.0001015337256831117, + "C": 0.9924094676971436, + "D": 8.417441858910024e-05 }, "sample": { "messages": [ @@ -2977,10 +2977,10 @@ ] }, "predict": { - "A": 0.9211418628692627, - "B": 0.03151974081993103, - "C": 0.005477309692651033, - "D": 0.007033004891127348 + "A": 0.898381233215332, + "B": 0.002859350759536028, + "C": 0.0013506615068763494, + "D": 0.0017342838691547513 }, "sample": { "messages": [ @@ -3022,10 +3022,10 @@ ] }, "predict": { - "A": 0.9349660277366638, - "B": 0.02823352813720703, - "C": 0.000752398045733571, - "D": 0.0009660982177592814 + "A": 0.9217290282249451, + "B": 0.0017793556908145547, + "C": 0.0001460584026062861, + "D": 0.00022621969401370734 }, "sample": { "messages": [ @@ -3067,10 +3067,10 @@ ] }, "predict": { - "A": 0.08162816613912582, - "B": 0.003586491337046027, - "C": 0.01607353985309601, - "D": 0.8775854706764221 + "A": 0.06313378363847733, + "B": 0.00042539212154224515, + "C": 0.020496539771556854, + "D": 0.8715350031852722 }, "sample": { "messages": [ @@ -3112,10 +3112,10 @@ ] }, "predict": { - "A": 0.015221492387354374, - "B": 0.9417204260826111, - "C": 0.003396374173462391, - "D": 0.009232302196323872 + "A": 0.018451465293765068, + "B": 0.8890414237976074, + "C": 0.005990313831716776, + "D": 0.00678791431710124 }, "sample": { "messages": [ @@ -3157,10 +3157,10 @@ ] }, "predict": { - "A": 0.006610890384763479, - "B": 0.9811431169509888, - "C": 0.0004226200981065631, - "D": 0.0021462419535964727 + "A": 0.001850851229391992, + "B": 0.9587646722793579, + "C": 0.00030214316211640835, + "D": 0.007320258300751448 }, "sample": { "messages": [ @@ -3202,10 +3202,10 @@ ] }, "predict": { - "A": 0.00033191966940648854, - "B": 0.0009022512240335345, - "C": 0.9894385933876038, - "D": 0.00033191966940648854 + "A": 8.892855112208053e-05, + "B": 9.466394840274006e-05, + "C": 0.9849372506141663, + "D": 8.354063174920157e-05 }, "sample": { "messages": [ @@ -3247,10 +3247,10 @@ ] }, "predict": { - "A": 0.0007908020634204149, - "B": 0.9826886653900146, - "C": 0.000155718153109774, - "D": 0.00032965533318929374 + "A": 0.000665134924929589, + "B": 0.936579704284668, + "C": 0.00020285467326175421, + "D": 0.0005180075531825423 }, "sample": { "messages": [ @@ -3292,10 +3292,10 @@ ] }, "predict": { - "A": 9.487453644396737e-05, - "B": 0.9871283769607544, - "C": 8.372648153454065e-05, - "D": 0.0002922341809608042 + "A": 0.00014675834972877055, + "B": 0.9261462092399597, + "C": 0.00010086544352816418, + "D": 0.00031068743555806577 }, "sample": { "messages": [ @@ -3337,10 +3337,10 @@ ] }, "predict": { - "A": 0.07197012007236481, - "B": 0.8767755627632141, - "C": 0.00974009744822979, - "D": 0.007585594896227121 + "A": 0.09915570169687271, + "B": 0.8302205204963684, + "C": 0.002994242822751403, + "D": 0.003187354886904359 }, "sample": { "messages": [ @@ -3382,10 +3382,10 @@ ] }, "predict": { - "A": 0.05168738588690758, - "B": 0.9161807894706726, - "C": 0.002573363482952118, - "D": 0.00480767572298646 + "A": 0.01816960796713829, + "B": 0.8754607439041138, + "C": 0.0045939963310956955, + "D": 0.016034623607993126 }, "sample": { "messages": [ @@ -3427,10 +3427,10 @@ ] }, "predict": { - "A": 0.9269450306892395, - "B": 0.004292607307434082, - "C": 0.00024217230384238064, - "D": 0.040727123618125916 + "A": 0.9211512207984924, + "B": 0.0005423325928859413, + "C": 6.894985563121736e-05, + "D": 0.01023306604474783 }, "sample": { "messages": [ @@ -3472,10 +3472,10 @@ ] }, "predict": { - "A": 0.9717972278594971, - "B": 0.010795692913234234, - "C": 0.0002538903208915144, - "D": 0.00041859439807012677 + "A": 0.9034465551376343, + "B": 0.0009335291688330472, + "C": 6.352746277116239e-05, + "D": 5.606278500636108e-05 }, "sample": { "messages": [ @@ -3510,17 +3510,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.14983032643795013, - "B": 0.6714929342269897, - "C": 0.022977223619818687, - "D": 0.07077483087778091 + "A": 0.40592899918556213, + "B": 0.40592899918556213, + "C": 0.04848130792379379, + "D": 0.06225122883915901 }, "sample": { "messages": [ @@ -3550,7 +3550,7 @@ "prompt_len": 96, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -3562,10 +3562,10 @@ ] }, "predict": { - "A": 0.12518569827079773, - "B": 0.12518569827079773, - "C": 0.7203940153121948, - "D": 0.0010174488415941596 + "A": 0.06396497040987015, + "B": 0.009809345938265324, + "C": 0.883009135723114, + "D": 0.0004048800328746438 }, "sample": { "messages": [ @@ -3607,10 +3607,10 @@ ] }, "predict": { - "A": 0.9724847674369812, - "B": 0.0035073277540504932, - "C": 0.000154101217049174, - "D": 0.00028789901989512146 + "A": 0.9315578937530518, + "B": 0.0005152301164343953, + "C": 5.7807214034255594e-05, + "D": 0.00013867230154573917 }, "sample": { "messages": [ @@ -3652,10 +3652,10 @@ ] }, "predict": { - "A": 0.008268820121884346, - "B": 0.01983586698770523, - "C": 0.0012680646032094955, - "D": 0.9557456374168396 + "A": 0.0032670414075255394, + "B": 0.001981560606509447, + "C": 0.0002854709164239466, + "D": 0.9642829895019531 }, "sample": { "messages": [ @@ -3697,10 +3697,10 @@ ] }, "predict": { - "A": 0.8650660514831543, - "B": 0.006604860536754131, - "C": 0.00025609825388528407, - "D": 0.11707396060228348 + "A": 0.6425434947013855, + "B": 0.00713801383972168, + "C": 0.0004857479070778936, + "D": 0.30351606011390686 }, "sample": { "messages": [ @@ -3742,10 +3742,10 @@ ] }, "predict": { - "A": 0.006485238205641508, - "B": 0.0032609826885163784, - "C": 0.9624946713447571, - "D": 0.01555727794766426 + "A": 0.004018506500869989, + "B": 0.00039788600406609476, + "C": 0.9832960963249207, + "D": 0.004553564358502626 }, "sample": { "messages": [ @@ -3787,10 +3787,10 @@ ] }, "predict": { - "A": 0.0008841687231324613, - "B": 0.9696087837219238, - "C": 0.0007802761974744499, - "D": 0.0007802761974744499 + "A": 0.0013806005008518696, + "B": 0.9182948470115662, + "C": 0.003311888547614217, + "D": 0.001218375633470714 }, "sample": { "messages": [ @@ -3825,17 +3825,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "D" ] }, "predict": { - "A": 0.03467627987265587, - "B": 0.004141490440815687, - "C": 0.6146517395973206, - "D": 0.3289993703365326 + "A": 0.008916371501982212, + "B": 0.0007319003343582153, + "C": 0.05131017789244652, + "D": 0.9094945788383484 }, "sample": { "messages": [ @@ -3865,7 +3865,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -3877,10 +3877,10 @@ ] }, "predict": { - "A": 0.06561755388975143, - "B": 0.010062778368592262, - "C": 0.9058223366737366, - "D": 0.005386216565966606 + "A": 0.07453414797782898, + "B": 0.002890002680942416, + "C": 0.908011794090271, + "D": 0.0013651405461132526 }, "sample": { "messages": [ @@ -3922,10 +3922,10 @@ ] }, "predict": { - "A": 0.0711737796664238, - "B": 0.7651902437210083, - "C": 0.10355724394321442, - "D": 0.02039163000881672 + "A": 0.19859205186367035, + "B": 0.5398291349411011, + "C": 0.19859205186367035, + "D": 0.01630142703652382 }, "sample": { "messages": [ @@ -3967,10 +3967,10 @@ ] }, "predict": { - "A": 0.03236209228634834, - "B": 0.004962887614965439, - "C": 0.004962887614965439, - "D": 0.9457589983940125 + "A": 0.0005960466805845499, + "B": 0.00018178396567236632, + "C": 0.00021927333727944642, + "D": 0.9510473012924194 }, "sample": { "messages": [ @@ -4012,10 +4012,10 @@ ] }, "predict": { - "A": 0.019337790086865425, - "B": 0.008061190135776997, - "C": 0.013290655799210072, - "D": 0.9317469596862793 + "A": 0.006358711048960686, + "B": 0.0017114237416535616, + "C": 0.004370274022221565, + "D": 0.9437163472175598 }, "sample": { "messages": [ @@ -4057,10 +4057,10 @@ ] }, "predict": { - "A": 0.48348021507263184, - "B": 0.006086117587983608, - "C": 0.0004408769018482417, - "D": 0.48348021507263184 + "A": 0.6306511163711548, + "B": 0.0007384165073744953, + "C": 0.00014540282427333295, + "D": 0.3375632166862488 }, "sample": { "messages": [ @@ -4102,10 +4102,10 @@ ] }, "predict": { - "A": 0.9525455832481384, - "B": 0.01058182679116726, - "C": 0.0005268380627967417, - "D": 0.001838845550082624 + "A": 0.8908635377883911, + "B": 0.0017197711858898401, + "C": 9.114453132497147e-05, + "D": 0.0005583279416896403 }, "sample": { "messages": [ @@ -4147,10 +4147,10 @@ ] }, "predict": { - "A": 0.0006700385129079223, - "B": 0.9434844255447388, - "C": 0.0002623908512759954, - "D": 0.0002043501881416887 + "A": 0.0009265969274565578, + "B": 0.8967376947402954, + "C": 0.00034087596577592194, + "D": 0.00032022330560721457 }, "sample": { "messages": [ @@ -4192,10 +4192,10 @@ ] }, "predict": { - "A": 0.7138051986694336, - "B": 0.02155504934489727, - "C": 0.1592714786529541, - "D": 0.04563203826546669 + "A": 0.783800482749939, + "B": 0.0011783962836489081, + "C": 0.05677831918001175, + "D": 0.00220153434202075 }, "sample": { "messages": [ @@ -4237,10 +4237,10 @@ ] }, "predict": { - "A": 0.0024515388067811728, - "B": 0.989021360874176, - "C": 0.0003759556566365063, - "D": 0.0011580248828977346 + "A": 0.0021015654783695936, + "B": 0.9607195258140564, + "C": 0.0002671840484254062, + "D": 0.0006822784198448062 }, "sample": { "messages": [ @@ -4282,10 +4282,10 @@ ] }, "predict": { - "A": 0.005061476957052946, - "B": 0.9645467400550842, - "C": 0.00047079072101041675, - "D": 0.0014501373516395688 + "A": 0.007775741629302502, + "B": 0.8987535834312439, + "C": 0.0004669695917982608, + "D": 0.00209281244315207 }, "sample": { "messages": [ @@ -4327,10 +4327,10 @@ ] }, "predict": { - "A": 0.0013925228267908096, - "B": 0.26536789536476135, - "C": 0.000580489868298173, - "D": 0.7213446497917175 + "A": 0.0009895800612866879, + "B": 0.003048121463507414, + "C": 0.00030180488829500973, + "D": 0.9576912522315979 }, "sample": { "messages": [ @@ -4372,10 +4372,10 @@ ] }, "predict": { - "A": 0.9526099562644958, - "B": 0.0030319488141685724, - "C": 0.00031956503516994417, - "D": 0.0004103296378161758 + "A": 0.8656497001647949, + "B": 0.0006544110365211964, + "C": 0.0001761323364917189, + "D": 0.00018749188166111708 }, "sample": { "messages": [ @@ -4417,10 +4417,10 @@ ] }, "predict": { - "A": 0.9684272408485413, - "B": 0.005081839859485626, - "C": 0.0007793253753334284, - "D": 0.003492694115266204 + "A": 0.8902744054794312, + "B": 0.0006322500994428992, + "C": 0.0004625640285667032, + "D": 0.0017186339246109128 }, "sample": { "messages": [ @@ -4455,17 +4455,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.019304459914565086, - "B": 0.4978685975074768, - "C": 0.43936750292778015, - "D": 0.008047296665608883 + "A": 0.012522611767053604, + "B": 0.2219686061143875, + "C": 0.6837114095687866, + "D": 0.0059152627363801 }, "sample": { "messages": [ @@ -4495,7 +4495,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -4507,10 +4507,10 @@ ] }, "predict": { - "A": 0.0009005481842905283, - "B": 0.003561737248674035, - "C": 0.00022769423958379775, - "D": 0.9875710010528564 + "A": 7.165073475334793e-05, + "B": 0.00028338414267636836, + "C": 8.642728062113747e-05, + "D": 0.9572342038154602 }, "sample": { "messages": [ @@ -4552,10 +4552,10 @@ ] }, "predict": { - "A": 0.05808473005890846, - "B": 0.9085980653762817, - "C": 0.006937231402844191, - "D": 0.0028918685857206583 + "A": 0.22530873119831085, + "B": 0.693999707698822, + "C": 0.009899374097585678, + "D": 0.004126673098653555 }, "sample": { "messages": [ @@ -4597,10 +4597,10 @@ ] }, "predict": { - "A": 0.8706260919570923, - "B": 0.03825264424085617, - "C": 0.043345920741558075, - "D": 0.0045686266385018826 + "A": 0.8565489649772644, + "B": 0.008397310972213745, + "C": 0.02586553804576397, + "D": 0.004222432617098093 }, "sample": { "messages": [ @@ -4642,10 +4642,10 @@ ] }, "predict": { - "A": 0.0034989078994840384, - "B": 0.013838448561728, - "C": 0.9701501727104187, - "D": 0.0005365748656913638 + "A": 0.002370840171352029, + "B": 0.012040119618177414, + "C": 0.9564651846885681, + "D": 0.000599442224483937 }, "sample": { "messages": [ @@ -4687,10 +4687,10 @@ ] }, "predict": { - "A": 0.0027399968821555376, - "B": 0.9755064249038696, - "C": 0.0006927796639502048, - "D": 0.005118987988680601 + "A": 0.0019301417050883174, + "B": 0.9392610192298889, + "C": 0.00031508697429671884, + "D": 0.0014121218118816614 }, "sample": { "messages": [ @@ -4725,17 +4725,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.4799673855304718, - "B": 0.016423581168055534, - "C": 0.002853990299627185, - "D": 0.4799673855304718 + "A": 0.04597972333431244, + "B": 0.0017828275449573994, + "C": 0.001151080010458827, + "D": 0.9235274791717529 }, "sample": { "messages": [ @@ -4765,7 +4765,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -4777,10 +4777,10 @@ ] }, "predict": { - "A": 0.019606513902544975, - "B": 0.9446947574615479, - "C": 0.008173211477696896, - "D": 0.0038607516326010227 + "A": 0.027559511363506317, + "B": 0.9126456379890442, + "C": 0.011488513089716434, + "D": 0.002728761173784733 }, "sample": { "messages": [ @@ -4822,10 +4822,10 @@ ] }, "predict": { - "A": 0.0006902164313942194, - "B": 0.9718970656394958, - "C": 0.00025391639792360365, - "D": 0.0012894939864054322 + "A": 0.00010606367868604138, + "B": 0.9148721098899841, + "C": 4.421391713549383e-05, + "D": 0.00032669908250682056 }, "sample": { "messages": [ @@ -4867,10 +4867,10 @@ ] }, "predict": { - "A": 0.0016847712686285377, - "B": 0.0021632888820022345, - "C": 0.9889361262321472, - "D": 0.0009017930133268237 + "A": 0.0009059003205038607, + "B": 0.00024381978437304497, + "C": 0.9934403300285339, + "D": 0.00021517016284633428 }, "sample": { "messages": [ @@ -4912,10 +4912,10 @@ ] }, "predict": { - "A": 0.0031362741719931364, - "B": 0.005170841701328754, - "C": 0.1940344125032425, - "D": 0.7674209475517273 + "A": 0.000688644009642303, + "B": 0.0007803358603268862, + "C": 0.07024359703063965, + "D": 0.8557422161102295 }, "sample": { "messages": [ @@ -4957,10 +4957,10 @@ ] }, "predict": { - "A": 0.036350566893815994, - "B": 0.013372625224292278, - "C": 0.9374934434890747, - "D": 0.002050758572295308 + "A": 0.005857205484062433, + "B": 0.0010178297525271773, + "C": 0.9850305914878845, + "D": 0.00022710851044394076 }, "sample": { "messages": [ @@ -5002,10 +5002,10 @@ ] }, "predict": { - "A": 0.8051750659942627, - "B": 0.10896860063076019, - "C": 0.027551576495170593, - "D": 0.027551576495170593 + "A": 0.9163981676101685, + "B": 0.010180264711380005, + "C": 0.003305047983303666, + "D": 0.0022715239319950342 }, "sample": { "messages": [ @@ -5047,10 +5047,10 @@ ] }, "predict": { - "A": 0.9529129266738892, - "B": 0.01058590691536665, - "C": 0.000984642654657364, - "D": 0.0005270412657409906 + "A": 0.890140950679779, + "B": 0.0025002227630466223, + "C": 0.0002635215350892395, + "D": 9.6944160759449e-05 }, "sample": { "messages": [ @@ -5092,10 +5092,10 @@ ] }, "predict": { - "A": 0.9459481835365295, - "B": 0.025208665058016777, - "C": 0.002656973898410797, - "D": 0.0009774459758773446 + "A": 0.9101178646087646, + "B": 0.004214682150632143, + "C": 0.0003053105319850147, + "D": 7.719458517385647e-05 }, "sample": { "messages": [ @@ -5130,17 +5130,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.14266948401927948, - "B": 0.5642687082290649, - "C": 0.07636547088623047, - "D": 0.16166570782661438 + "A": 0.45970800518989563, + "B": 0.35802093148231506, + "C": 0.03773513063788414, + "D": 0.10257472097873688 }, "sample": { "messages": [ @@ -5170,7 +5170,7 @@ "prompt_len": 100, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -5182,10 +5182,10 @@ ] }, "predict": { - "A": 0.015089627355337143, - "B": 0.933562159538269, - "C": 0.0038152553606778383, - "D": 0.009152322076261044 + "A": 0.020409302785992622, + "B": 0.8678256273269653, + "C": 0.003129867836833, + "D": 0.018011145293712616 }, "sample": { "messages": [ @@ -5227,10 +5227,10 @@ ] }, "predict": { - "A": 0.9090956449508667, - "B": 0.005405679810792208, - "C": 0.05811653658747673, - "D": 0.0017549673793837428 + "A": 0.8362614512443542, + "B": 0.0014246717328205705, + "C": 0.06057857722043991, + "D": 0.0009791614720597863 }, "sample": { "messages": [ @@ -5272,10 +5272,10 @@ ] }, "predict": { - "A": 0.9523758292198181, - "B": 0.010579939931631088, - "C": 0.00499760964885354, - "D": 0.0007664082804694772 + "A": 0.9109342098236084, + "B": 0.0017585166497156024, + "C": 0.00155188562348485, + "D": 0.0004176843212917447 }, "sample": { "messages": [ @@ -5317,10 +5317,10 @@ ] }, "predict": { - "A": 0.0007057387847453356, - "B": 0.0006228122510947287, - "C": 0.9937542080879211, - "D": 7.438432658091187e-05 + "A": 0.0002024771529249847, + "B": 6.175204180181026e-05, + "C": 0.9951283931732178, + "D": 2.574208156147506e-05 }, "sample": { "messages": [ @@ -5362,10 +5362,10 @@ ] }, "predict": { - "A": 0.9617673754692078, - "B": 0.007343183737248182, - "C": 0.00028472617850638926, - "D": 0.0007739659631624818 + "A": 0.8901432752609253, + "B": 0.0025002295151352882, + "C": 0.0001325072953477502, + "D": 0.00038342259358614683 }, "sample": { "messages": [ @@ -5403,14 +5403,14 @@ "acc": false, "f1_macro": [ "C", - "B" + "A" ] }, "predict": { - "A": 0.365357905626297, - "B": 0.531592607498169, - "C": 0.026466436684131622, - "D": 0.03398357704281807 + "A": 0.587335467338562, + "B": 0.21606867015361786, + "C": 0.025805721059441566, + "D": 0.04821143299341202 }, "sample": { "messages": [ @@ -5440,7 +5440,7 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -5452,10 +5452,10 @@ ] }, "predict": { - "A": 0.9661812782287598, - "B": 0.006510077975690365, - "C": 0.00041617537499405444, - "D": 0.0018651686841621995 + "A": 0.938271701335907, + "B": 0.0007550582522526383, + "C": 0.00011579192505450919, + "D": 0.0004875022277701646 }, "sample": { "messages": [ @@ -5497,10 +5497,10 @@ ] }, "predict": { - "A": 0.8987516164779663, - "B": 0.04474620893597603, - "C": 0.008811051025986671, - "D": 0.030753588303923607 + "A": 0.9356836676597595, + "B": 0.013346809893846512, + "C": 0.0023193280212581158, + "D": 0.024935124441981316 }, "sample": { "messages": [ @@ -5542,10 +5542,10 @@ ] }, "predict": { - "A": 0.07722806185483932, - "B": 0.04684119299054146, - "C": 0.8302799463272095, - "D": 0.015207108110189438 + "A": 0.07393859326839447, + "B": 0.024004347622394562, + "C": 0.7949147820472717, + "D": 0.08378340303897858 }, "sample": { "messages": [ @@ -5587,10 +5587,10 @@ ] }, "predict": { - "A": 0.05082181096076965, - "B": 0.9008380770683289, - "C": 0.006069799419492483, - "D": 0.011339878663420677 + "A": 0.1124778762459755, + "B": 0.8311053514480591, + "C": 0.008147870190441608, + "D": 0.01724904030561447 }, "sample": { "messages": [ @@ -5632,10 +5632,10 @@ ] }, "predict": { - "A": 1.8795270079863258e-05, - "B": 0.9931185245513916, - "C": 4.508751226239838e-05, - "D": 0.00013887931709177792 + "A": 1.0924991329375189e-05, + "B": 0.9517455101013184, + "C": 7.123989780666307e-05, + "D": 0.00019365010666660964 }, "sample": { "messages": [ @@ -5677,10 +5677,10 @@ ] }, "predict": { - "A": 0.003918759990483522, - "B": 0.017562665045261383, - "C": 0.9588889479637146, - "D": 0.008296014741063118 + "A": 0.004417977295815945, + "B": 0.11394114047288895, + "C": 0.8419174551963806, + "D": 0.004417977295815945 }, "sample": { "messages": [ @@ -5715,17 +5715,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.02954293228685856, - "B": 0.8633710145950317, - "C": 0.0551934614777565, - "D": 0.01395509298890829 + "A": 0.10033691674470901, + "B": 0.2727440297603607, + "C": 0.5773991346359253, + "D": 0.014454904943704605 }, "sample": { "messages": [ @@ -5755,7 +5755,7 @@ "prompt_len": 59, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -5767,10 +5767,10 @@ ] }, "predict": { - "A": 0.0035479560028761625, - "B": 0.0066284541971981525, - "C": 0.9837498664855957, - "D": 0.0002570130454842001 + "A": 0.0016704392619431019, + "B": 0.0051453146152198315, + "C": 0.980523407459259, + "D": 0.00010031765850726515 }, "sample": { "messages": [ @@ -5812,10 +5812,10 @@ ] }, "predict": { - "A": 0.0008897038642317057, - "B": 0.0024184659123420715, - "C": 0.9756787419319153, - "D": 0.009565218351781368 + "A": 0.00053970399312675, + "B": 0.0008898214437067509, + "C": 0.9758077263832092, + "D": 0.0014670675154775381 }, "sample": { "messages": [ @@ -5850,17 +5850,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.3374611437320709, - "B": 0.6304603815078735, - "C": 0.007003782782703638, - "D": 0.00146807252895087 + "A": 0.7498531341552734, + "B": 0.16731485724449158, + "C": 0.0028788112103939056, + "D": 0.00046995296725071967 }, "sample": { "messages": [ @@ -5890,7 +5890,7 @@ "prompt_len": 66, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -5902,10 +5902,10 @@ ] }, "predict": { - "A": 0.0002577869745437056, - "B": 0.002445814898237586, - "C": 0.9867120981216431, - "D": 0.00020076471264474094 + "A": 0.0002265754883410409, + "B": 0.000790826219599694, + "C": 0.9827187061309814, + "D": 0.00010054226731881499 }, "sample": { "messages": [ @@ -5947,10 +5947,10 @@ ] }, "predict": { - "A": 0.006841400172561407, - "B": 0.021072998642921448, - "C": 0.8960466980934143, - "D": 0.06490940600633621 + "A": 0.0031040499452501535, + "B": 0.0035173494834452868, + "C": 0.9752634763717651, + "D": 0.008437682874500751 }, "sample": { "messages": [ @@ -5992,10 +5992,10 @@ ] }, "predict": { - "A": 0.013840913772583008, - "B": 0.9703229665756226, - "C": 0.0003255070769228041, - "D": 0.0018731639720499516 + "A": 0.01651971973478794, + "B": 0.9019461870193481, + "C": 0.0004402356571517885, + "D": 0.000992086250334978 }, "sample": { "messages": [ @@ -6037,10 +6037,10 @@ ] }, "predict": { - "A": 0.01985677145421505, - "B": 0.9567528367042542, - "C": 0.005689060315489769, - "D": 0.007304897531867027 + "A": 0.01875467412173748, + "B": 0.9036508202552795, + "C": 0.012889886274933815, + "D": 0.011375285685062408 }, "sample": { "messages": [ @@ -6082,10 +6082,10 @@ ] }, "predict": { - "A": 0.028952814638614655, - "B": 0.18879607319831848, - "C": 0.7467029690742493, - "D": 0.015497324988245964 + "A": 0.006628544069826603, + "B": 0.09150423109531403, + "C": 0.8681679964065552, + "D": 0.0027631884440779686 }, "sample": { "messages": [ @@ -6127,10 +6127,10 @@ ] }, "predict": { - "A": 0.00241504842415452, - "B": 0.974299967288971, - "C": 0.0016598369693383574, - "D": 0.0035138744860887527 + "A": 0.004278102889657021, + "B": 0.9238128662109375, + "C": 0.004278102889657021, + "D": 0.00905674323439598 }, "sample": { "messages": [ @@ -6172,10 +6172,10 @@ ] }, "predict": { - "A": 0.003108195262029767, - "B": 0.000890512834303081, - "C": 0.00027159106684848666, - "D": 0.9765658378601074 + "A": 0.0011489003663882613, + "B": 0.00030922231962904334, + "C": 7.818364247214049e-05, + "D": 0.9217787384986877 }, "sample": { "messages": [ @@ -6217,10 +6217,10 @@ ] }, "predict": { - "A": 0.10338612645864487, - "B": 0.006609253119677305, - "C": 0.0021457106340676546, - "D": 0.8656414151191711 + "A": 0.009326113387942314, + "B": 0.0005961984279565513, + "C": 0.0004942658706568182, + "D": 0.9512894153594971 }, "sample": { "messages": [ @@ -6262,10 +6262,10 @@ ] }, "predict": { - "A": 0.0007949374848976731, - "B": 0.0011566270841285586, - "C": 0.00037550186971202493, - "D": 0.9878275394439697 + "A": 0.00014435310731641948, + "B": 0.00019730730855371803, + "C": 0.00011967293539782986, + "D": 0.9697198271751404 }, "sample": { "messages": [ @@ -6307,10 +6307,10 @@ ] }, "predict": { - "A": 0.9730170965194702, - "B": 0.006556137464940548, - "C": 0.001290980027988553, - "D": 0.00025420900783501565 + "A": 0.9445015788078308, + "B": 0.0014200006844475865, + "C": 0.0003372797218617052, + "D": 8.527767204213887e-05 }, "sample": { "messages": [ @@ -6352,10 +6352,10 @@ ] }, "predict": { - "A": 0.5302155017852783, - "B": 0.2210267037153244, - "C": 0.17213577032089233, - "D": 0.03389554098248482 + "A": 0.5075632929801941, + "B": 0.07783735543489456, + "C": 0.30785268545150757, + "D": 0.01631559617817402 }, "sample": { "messages": [ @@ -6397,10 +6397,10 @@ ] }, "predict": { - "A": 0.004011578857898712, - "B": 0.003124220995232463, - "C": 0.9816009998321533, - "D": 0.002757115289568901 + "A": 0.0011604229221120477, + "B": 0.0003324667632114142, + "C": 0.9910694360733032, + "D": 0.000661189085803926 }, "sample": { "messages": [ @@ -6442,10 +6442,10 @@ ] }, "predict": { - "A": 0.0014467071741819382, - "B": 0.017624499276280403, - "C": 0.11492617428302765, - "D": 0.8491959571838379 + "A": 0.0009833724470809102, + "B": 0.004140153061598539, + "C": 0.0647628903388977, + "D": 0.8940240740776062 }, "sample": { "messages": [ @@ -6487,10 +6487,10 @@ ] }, "predict": { - "A": 0.0027035363018512726, - "B": 0.962525486946106, - "C": 0.0003033283574040979, - "D": 0.003933621570467949 + "A": 0.0055324239656329155, + "B": 0.9304106831550598, + "C": 0.0004266147443559021, + "D": 0.006269058212637901 }, "sample": { "messages": [ @@ -6525,17 +6525,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "A" + "C" ] }, "predict": { - "A": 0.9010331630706787, - "B": 0.003249633125960827, - "C": 0.05083281546831131, - "D": 0.003249633125960827 + "A": 0.346391886472702, + "B": 0.0008586195763200521, + "C": 0.5711036324501038, + "D": 0.0007577291107736528 }, "sample": { "messages": [ @@ -6565,22 +6565,22 @@ "prompt_len": 89, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.003217293182387948, - "B": 0.7872456908226013, - "C": 0.06462106108665466, - "D": 0.1207280382514 + "A": 0.0032861491199582815, + "B": 0.11584076285362244, + "C": 0.45815908908843994, + "D": 0.4043239653110504 }, "sample": { "messages": [ @@ -6610,7 +6610,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -6622,10 +6622,10 @@ ] }, "predict": { - "A": 0.04409314692020416, - "B": 0.04996408522129059, - "C": 0.8856345415115356, - "D": 0.001331497565843165 + "A": 0.005810275673866272, + "B": 0.00399334030225873, + "C": 0.9771381616592407, + "D": 0.0002892766206059605 }, "sample": { "messages": [ @@ -6667,10 +6667,10 @@ ] }, "predict": { - "A": 0.010718206875026226, - "B": 0.015594897791743279, - "C": 0.9648221731185913, - "D": 0.00041558995144441724 + "A": 0.002778064925223589, + "B": 0.0007477059843949974, + "C": 0.989059567451477, + "D": 8.930075273383409e-05 }, "sample": { "messages": [ @@ -6712,10 +6712,10 @@ ] }, "predict": { - "A": 0.10386619716882706, - "B": 0.009661061689257622, - "C": 0.8696609735488892, - "D": 0.00042447741725482047 + "A": 0.08139614760875702, + "B": 0.0031560712959617376, + "C": 0.8750910758972168, + "D": 0.0002935603552032262 }, "sample": { "messages": [ @@ -6757,10 +6757,10 @@ ] }, "predict": { - "A": 0.9666714072227478, - "B": 0.0014533315552398562, - "C": 9.290838352171704e-05, - "D": 0.0005346508114598691 + "A": 0.9038349390029907, + "B": 0.00019576246268115938, + "C": 2.6493569748708978e-05, + "D": 0.00016229278116952628 }, "sample": { "messages": [ @@ -6802,10 +6802,10 @@ ] }, "predict": { - "A": 0.964509129524231, - "B": 0.0034785631578415632, - "C": 0.00025198626099154353, - "D": 0.0012796918163076043 + "A": 0.912966251373291, + "B": 0.00106897356454283, + "C": 0.00011266898945905268, + "D": 0.00044561451068148017 }, "sample": { "messages": [ @@ -6847,10 +6847,10 @@ ] }, "predict": { - "A": 0.0014614106621593237, - "B": 0.9720450639724731, - "C": 8.776453614700586e-05, - "D": 0.0004744506150018424 + "A": 0.0017146216705441475, + "B": 0.8881959915161133, + "C": 7.077088230289519e-05, + "D": 0.0004912473377771676 }, "sample": { "messages": [ @@ -6892,10 +6892,10 @@ ] }, "predict": { - "A": 0.7252323627471924, - "B": 0.024816084653139114, - "C": 0.03186448663473129, - "D": 0.18336744606494904 + "A": 0.8442294001579285, + "B": 0.00222759903408587, + "C": 0.010627279989421368, + "D": 0.03709285333752632 }, "sample": { "messages": [ @@ -6937,10 +6937,10 @@ ] }, "predict": { - "A": 0.004008793737739325, - "B": 0.9809194207191467, - "C": 0.0013014646247029305, - "D": 0.0006966238142922521 + "A": 0.0037635662592947483, + "B": 0.9209142327308655, + "C": 0.0008397651254199445, + "D": 0.00078888627467677 }, "sample": { "messages": [ @@ -6982,10 +6982,10 @@ ] }, "predict": { - "A": 0.055285628885030746, - "B": 0.007482096552848816, - "C": 0.04878940060734749, - "D": 0.8648127913475037 + "A": 0.06292334944009781, + "B": 0.00276465923525393, + "C": 0.03368044272065163, + "D": 0.868630051612854 }, "sample": { "messages": [ @@ -7027,10 +7027,10 @@ ] }, "predict": { - "A": 0.11333148181438446, - "B": 0.011945049278438091, - "C": 0.8374126553535461, - "D": 0.005642442032694817 + "A": 0.08862629532814026, + "B": 0.003436414524912834, + "C": 0.8408628702163696, + "D": 0.0023618109989911318 }, "sample": { "messages": [ @@ -7072,10 +7072,10 @@ ] }, "predict": { - "A": 0.9574280977249146, - "B": 0.005693075712770224, - "C": 0.00046731613110750914, - "D": 0.0014394349418580532 + "A": 0.903498649597168, + "B": 0.0004997109645046294, + "C": 0.00010474511509528384, + "D": 0.00023604673333466053 }, "sample": { "messages": [ @@ -7117,10 +7117,10 @@ ] }, "predict": { - "A": 0.01223623938858509, - "B": 0.006549586541950703, - "C": 0.972044825553894, - "D": 0.0030938058625906706 + "A": 0.013914680108428001, + "B": 0.0021338853985071182, + "C": 0.975494384765625, + "D": 0.0021338853985071182 }, "sample": { "messages": [ @@ -7162,10 +7162,10 @@ ] }, "predict": { - "A": 0.001673118444159627, - "B": 0.9820960760116577, - "C": 0.0001373379200231284, - "D": 0.0006974595016799867 + "A": 0.0007170370081439614, + "B": 0.8910247087478638, + "C": 0.0001032989748637192, + "D": 0.0003605488163884729 }, "sample": { "messages": [ @@ -7207,10 +7207,10 @@ ] }, "predict": { - "A": 0.4622841477394104, - "B": 0.3600272536277771, - "C": 0.026080287992954254, - "D": 0.05521196871995926 + "A": 0.5895159244537354, + "B": 0.10244250297546387, + "C": 0.010797361843287945, + "D": 0.033258214592933655 }, "sample": { "messages": [ @@ -7252,10 +7252,10 @@ ] }, "predict": { - "A": 0.0002007259172387421, - "B": 0.9865214824676514, - "C": 0.0003750053874682635, - "D": 0.00048151647206395864 + "A": 0.00010606972500681877, + "B": 0.9149243235588074, + "C": 0.00021094482508488, + "D": 0.00023903178225737065 }, "sample": { "messages": [ @@ -7297,10 +7297,10 @@ ] }, "predict": { - "A": 0.37518706917762756, - "B": 0.25786206126213074, - "C": 0.17722582817077637, - "D": 0.12180540710687637 + "A": 0.2663206458091736, + "B": 0.16153164207935333, + "C": 0.23502713441848755, + "D": 0.23502713441848755 }, "sample": { "messages": [ @@ -7342,10 +7342,10 @@ ] }, "predict": { - "A": 0.00048204674385488033, - "B": 0.00037541837082244456, - "C": 0.00027466192841529846, - "D": 0.9876078963279724 + "A": 0.0003208886191714555, + "B": 6.726191350026056e-05, + "C": 0.00017175929679069668, + "D": 0.9565554857254028 }, "sample": { "messages": [ @@ -7387,10 +7387,10 @@ ] }, "predict": { - "A": 0.009460617788136005, - "B": 0.01559792086482048, - "C": 0.9650092124938965, - "D": 0.0005337314796634018 + "A": 0.008387862704694271, + "B": 0.008387862704694271, + "C": 0.9695051312446594, + "D": 0.0005362181109376252 }, "sample": { "messages": [ @@ -7432,10 +7432,10 @@ ] }, "predict": { - "A": 0.0004248044278938323, - "B": 0.0004248044278938323, - "C": 0.9862142205238342, - "D": 0.0011547381291165948 + "A": 0.00012986271758563817, + "B": 0.00014715372526552528, + "C": 0.9885334372520447, + "D": 0.00020113529171794653 }, "sample": { "messages": [ @@ -7477,10 +7477,10 @@ ] }, "predict": { - "A": 0.13920578360557556, - "B": 0.01467218343168497, - "C": 0.8010740280151367, - "D": 0.0015464368043467402 + "A": 0.14708249270915985, + "B": 0.006462354212999344, + "C": 0.7469465732574463, + "D": 0.0009309904417023063 }, "sample": { "messages": [ @@ -7522,10 +7522,10 @@ ] }, "predict": { - "A": 0.000257595325820148, - "B": 0.00020061545365024358, - "C": 2.396009767835494e-05, - "D": 0.9859785437583923 + "A": 0.00011925557919312268, + "B": 6.383290747180581e-05, + "C": 2.2060065020923503e-05, + "D": 0.9663379788398743 }, "sample": { "messages": [ @@ -7567,10 +7567,10 @@ ] }, "predict": { - "A": 0.0019969413988292217, - "B": 0.0008324490045197308, - "C": 0.0030929234344512224, - "D": 0.9717676043510437 + "A": 0.00020479261002037674, + "B": 7.077444024616852e-05, + "C": 0.00010961759835481644, + "D": 0.9455271363258362 }, "sample": { "messages": [ @@ -7612,10 +7612,10 @@ ] }, "predict": { - "A": 0.00025646676658652723, - "B": 0.9816588759422302, - "C": 7.347897189902142e-05, - "D": 0.0010143457911908627 + "A": 0.00014737271703779697, + "B": 0.9300232529640198, + "C": 4.784491829923354e-05, + "D": 0.000660478719510138 }, "sample": { "messages": [ @@ -7657,10 +7657,10 @@ ] }, "predict": { - "A": 0.0008843845571391284, - "B": 0.006534766871482134, - "C": 0.9698454141616821, - "D": 0.01776333898305893 + "A": 0.0001772247487679124, + "B": 0.001019858056679368, + "C": 0.9869935512542725, + "D": 0.0040336172096431255 }, "sample": { "messages": [ @@ -7702,10 +7702,10 @@ ] }, "predict": { - "A": 0.9686682820320129, - "B": 0.0008833110914565623, - "C": 0.00022333601373247802, - "D": 0.0004172466287855059 + "A": 0.8665341138839722, + "B": 0.00014616800763178617, + "C": 5.724021320929751e-05, + "D": 0.00011383576929802075 }, "sample": { "messages": [ @@ -7747,10 +7747,10 @@ ] }, "predict": { - "A": 0.19384939968585968, - "B": 0.7666893005371094, - "C": 0.0024402039125561714, - "D": 0.002765113255009055 + "A": 0.4147467315196991, + "B": 0.46996963024139404, + "C": 0.0024661743082106113, + "D": 0.0009657676564529538 }, "sample": { "messages": [ @@ -7792,10 +7792,10 @@ ] }, "predict": { - "A": 0.5677024126052856, - "B": 0.30386924743652344, - "C": 0.0465998537838459, - "D": 0.02494310401380062 + "A": 0.6429242491722107, + "B": 0.18420088291168213, + "C": 0.024928877130150795, + "D": 0.01941462978720665 }, "sample": { "messages": [ @@ -7837,10 +7837,10 @@ ] }, "predict": { - "A": 0.006574505474418402, - "B": 0.9757431149482727, - "C": 0.00019853285630233586, - "D": 0.00028886363725177944 + "A": 0.006966990418732166, + "B": 0.9124957323074341, + "C": 0.00018566401558928192, + "D": 0.0007816746947355568 }, "sample": { "messages": [ @@ -7882,10 +7882,10 @@ ] }, "predict": { - "A": 0.002707574050873518, - "B": 0.003939496818929911, - "C": 0.0018608864629641175, - "D": 0.9639630317687988 + "A": 0.0005672867991961539, + "B": 0.0005672867991961539, + "C": 0.00038989013410173357, + "D": 0.9635359048843384 }, "sample": { "messages": [ @@ -7927,10 +7927,10 @@ ] }, "predict": { - "A": 0.004826574586331844, - "B": 0.05879971385002136, - "C": 0.002012015786021948, - "D": 0.9197823405265808 + "A": 0.0011932413326576352, + "B": 0.003047048579901457, + "C": 0.0004672800423577428, + "D": 0.9573541283607483 }, "sample": { "messages": [ @@ -7972,10 +7972,10 @@ ] }, "predict": { - "A": 0.0058321417309343815, - "B": 0.0004497265035752207, - "C": 7.815074786776677e-05, - "D": 0.9808155298233032 + "A": 0.007393778767436743, + "B": 0.00022327277110889554, + "C": 8.213746332330629e-05, + "D": 0.9683939218521118 }, "sample": { "messages": [ @@ -8017,10 +8017,10 @@ ] }, "predict": { - "A": 0.003922022879123688, - "B": 0.959687352180481, - "C": 0.003054474713280797, - "D": 0.0026955644134432077 + "A": 0.0018528025830164552, + "B": 0.9016256928443909, + "C": 0.0009316476644016802, + "D": 0.0028696770314127207 }, "sample": { "messages": [ @@ -8062,10 +8062,10 @@ ] }, "predict": { - "A": 0.003863954683765769, - "B": 0.01962278038263321, - "C": 0.003863954683765769, - "D": 0.9454785585403442 + "A": 0.0015234120655804873, + "B": 0.0013444063952192664, + "C": 0.0020822572987526655, + "D": 0.9518929719924927 }, "sample": { "messages": [ @@ -8107,10 +8107,10 @@ ] }, "predict": { - "A": 0.003996888641268015, - "B": 0.9780063629150391, - "C": 0.0012975996360182762, - "D": 0.0018879964482039213 + "A": 0.0033408310264348984, + "B": 0.9263198375701904, + "C": 0.0007002762285992503, + "D": 0.002026316476985812 }, "sample": { "messages": [ @@ -8152,10 +8152,10 @@ ] }, "predict": { - "A": 0.7670194506645203, - "B": 0.06296078860759735, - "C": 0.13328798115253448, - "D": 0.005168136674910784 + "A": 0.8880743980407715, + "B": 0.00870637595653534, + "C": 0.026817524805665016, + "D": 0.0017143869772553444 }, "sample": { "messages": [ @@ -8197,10 +8197,10 @@ ] }, "predict": { - "A": 0.8083279132843018, - "B": 0.02440938726067543, - "C": 0.10939528793096542, - "D": 0.0276594590395689 + "A": 0.9190604090690613, + "B": 0.0016667103627696633, + "C": 0.007951430045068264, + "D": 0.0037559899501502514 }, "sample": { "messages": [ @@ -8242,10 +8242,10 @@ ] }, "predict": { - "A": 0.007601063698530197, - "B": 0.38986021280288696, - "C": 0.567243218421936, - "D": 0.0031685950234532356 + "A": 0.003263935213908553, + "B": 0.02265620045363903, + "C": 0.9633661508560181, + "D": 0.001641209819354117 }, "sample": { "messages": [ @@ -8287,10 +8287,10 @@ ] }, "predict": { - "A": 0.002841853303834796, - "B": 0.0026696741115301847, - "C": 0.06468028575181961, - "D": 0.8928837776184082 + "A": 0.0013387090293690562, + "B": 0.0008119679987430573, + "C": 0.08816461265087128, + "D": 0.836482584476471 }, "sample": { "messages": [ @@ -8325,17 +8325,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.2643863558769226, - "B": 0.6342298984527588, - "C": 0.040544960647821426, - "D": 0.01491565816104412 + "A": 0.7140478491783142, + "B": 0.1805395781993866, + "C": 0.016792796552181244, + "D": 0.002272658050060272 }, "sample": { "messages": [ @@ -8365,7 +8365,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -8377,10 +8377,10 @@ ] }, "predict": { - "A": 0.001687613083049655, - "B": 0.001687613083049655, - "C": 0.9906042218208313, - "D": 0.0006208380800671875 + "A": 0.001903720898553729, + "B": 0.0005454251659102738, + "C": 0.9861518740653992, + "D": 0.00022736703976988792 }, "sample": { "messages": [ @@ -8422,10 +8422,10 @@ ] }, "predict": { - "A": 0.003116519656032324, - "B": 0.9791812896728516, - "C": 0.0001758219877956435, - "D": 0.0001758219877956435 + "A": 0.0012085047783330083, + "B": 0.9108551740646362, + "C": 4.135275958105922e-05, + "D": 0.00013559048238676041 }, "sample": { "messages": [ @@ -8467,10 +8467,10 @@ ] }, "predict": { - "A": 0.001016685157082975, - "B": 0.9839228987693787, - "C": 4.7551002353429794e-05, - "D": 0.0005441923858597875 + "A": 0.001238142023794353, + "B": 0.9331929087638855, + "C": 2.9118311431375332e-05, + "D": 0.00020212112576700747 }, "sample": { "messages": [ @@ -8505,17 +8505,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "A" + "B" ] }, "predict": { - "A": 0.47227147221565247, - "B": 0.3678053915500641, - "C": 0.000552973011508584, - "D": 0.11940892785787582 + "A": 0.08026084303855896, + "B": 0.8628854155540466, + "C": 0.0003280077362433076, + "D": 0.017908615991473198 }, "sample": { "messages": [ @@ -8545,7 +8545,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " B" } } { @@ -8557,10 +8557,10 @@ ] }, "predict": { - "A": 0.9496214389801025, - "B": 0.003424869617447257, - "C": 0.0020772884599864483, - "D": 0.0011118922848254442 + "A": 0.9014610648155212, + "B": 0.0007722220034338534, + "C": 0.00197193818166852, + "D": 0.00025070380070246756 }, "sample": { "messages": [ @@ -8602,10 +8602,10 @@ ] }, "predict": { - "A": 0.012166137807071209, - "B": 0.00651206448674202, - "C": 0.9664760231971741, - "D": 0.00028612016467377543 + "A": 0.004497753921896219, + "B": 0.0009427802870050073, + "C": 0.9712443351745605, + "D": 8.769223495619372e-05 }, "sample": { "messages": [ @@ -8647,10 +8647,10 @@ ] }, "predict": { - "A": 0.013733317144215107, - "B": 0.010695519857108593, - "C": 0.9627799391746521, - "D": 0.0007747808122076094 + "A": 0.002436724491417408, + "B": 0.000791088561527431, + "C": 0.9830447435379028, + "D": 0.00016582116950303316 }, "sample": { "messages": [ @@ -8692,10 +8692,10 @@ ] }, "predict": { - "A": 0.9476543664932251, - "B": 0.01735689491033554, - "C": 0.0007626087754033506, - "D": 0.006385244894772768 + "A": 0.7752026915550232, + "B": 0.001320650801062584, + "C": 0.00017873062461148947, + "D": 0.0005505291046574712 }, "sample": { "messages": [ @@ -8737,10 +8737,10 @@ ] }, "predict": { - "A": 0.007419073488563299, - "B": 0.005777980200946331, - "C": 0.0008323970250785351, - "D": 0.9717069268226624 + "A": 0.0006783453281968832, + "B": 0.0004662194987758994, + "C": 0.00015135930152609944, + "D": 0.955181360244751 }, "sample": { "messages": [ @@ -8782,10 +8782,10 @@ ] }, "predict": { - "A": 0.9421868324279785, - "B": 0.002061025472357869, - "C": 0.00040583996451459825, - "D": 0.0002789294521789998 + "A": 0.834730327129364, + "B": 0.00029808058752678335, + "C": 0.00010965773253701627, + "D": 4.866032031713985e-05 }, "sample": { "messages": [ @@ -8827,10 +8827,10 @@ ] }, "predict": { - "A": 0.0003332376654725522, - "B": 0.0007054641610011458, - "C": 0.9933674931526184, - "D": 0.00037760776467621326 + "A": 3.084420313825831e-05, + "B": 3.96047362301033e-05, + "C": 0.9885048270225525, + "D": 2.4021488570724614e-05 }, "sample": { "messages": [ @@ -8872,10 +8872,10 @@ ] }, "predict": { - "A": 0.02525666169822216, - "B": 0.9477491974830627, - "C": 0.0063858842477202415, - "D": 0.00142488325946033 + "A": 0.0039007430896162987, + "B": 0.9544803500175476, + "C": 0.00038622584543190897, + "D": 0.00036282563814893365 }, "sample": { "messages": [ @@ -8913,14 +8913,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.2631323039531708, - "B": 0.5570511221885681, - "C": 0.013100586831569672, - "D": 0.14084456861019135 + "A": 0.48791974782943726, + "B": 0.05827370285987854, + "C": 0.016695694997906685, + "D": 0.37999227643013 }, "sample": { "messages": [ @@ -8950,7 +8950,7 @@ "prompt_len": 77, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -8962,10 +8962,10 @@ ] }, "predict": { - "A": 0.0007777793798595667, - "B": 0.9665061235427856, - "C": 0.000998688512481749, - "D": 0.000998688512481749 + "A": 0.0006489515071734786, + "B": 0.858427882194519, + "C": 0.0005054039065726101, + "D": 0.00069080526009202 }, "sample": { "messages": [ @@ -9007,10 +9007,10 @@ ] }, "predict": { - "A": 0.9142619967460632, - "B": 0.035449784249067307, - "C": 0.013041246682405472, - "D": 0.002567973919212818 + "A": 0.9093900918960571, + "B": 0.0032797728199511766, + "C": 0.001206560991704464, + "D": 0.0006874777609482408 }, "sample": { "messages": [ @@ -9052,10 +9052,10 @@ ] }, "predict": { - "A": 0.9680305123329163, - "B": 0.003491263370960951, - "C": 0.00047249114140868187, - "D": 0.0011334471637383103 + "A": 0.8797569870948792, + "B": 0.000355989410309121, + "C": 0.00013096118345856667, + "D": 0.0002951255883090198 }, "sample": { "messages": [ @@ -9097,10 +9097,10 @@ ] }, "predict": { - "A": 0.00020286501967348158, - "B": 0.0002298761683050543, - "C": 0.9970346689224243, - "D": 0.00010858582390937954 + "A": 2.9170305424486287e-05, + "B": 3.1051629775902256e-05, + "C": 0.995152473449707, + "D": 1.8833763533621095e-05 }, "sample": { "messages": [ @@ -9135,17 +9135,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.44260114431381226, - "B": 0.013365396298468113, - "C": 0.01716150902211666, - "D": 0.501532793045044 + "A": 0.7568364143371582, + "B": 0.0025642013642936945, + "C": 0.012233120389282703, + "D": 0.19135820865631104 }, "sample": { "messages": [ @@ -9175,7 +9175,7 @@ "prompt_len": 79, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -9187,10 +9187,10 @@ ] }, "predict": { - "A": 0.43332403898239136, - "B": 0.3824070990085602, - "C": 0.051753174513578415, - "D": 0.08532655239105225 + "A": 0.4790736436843872, + "B": 0.17624135315418243, + "C": 0.027027485892176628, + "D": 0.25642964243888855 }, "sample": { "messages": [ @@ -9232,10 +9232,10 @@ ] }, "predict": { - "A": 0.0019286557799205184, - "B": 0.0021854531951248646, - "C": 0.8816747665405273, - "D": 0.10530103743076324 + "A": 0.0003755484940484166, + "B": 0.0003314203640911728, + "C": 0.9879502058029175, + "D": 0.002774948952719569 }, "sample": { "messages": [ @@ -9277,10 +9277,10 @@ ] }, "predict": { - "A": 0.9406440258026123, - "B": 0.0012480281293392181, - "C": 0.0012480281293392181, - "D": 0.007181905210018158 + "A": 0.8622968196868896, + "B": 0.0003277840150985867, + "C": 0.0006518763257190585, + "D": 0.002578220795840025 }, "sample": { "messages": [ @@ -9322,10 +9322,10 @@ ] }, "predict": { - "A": 0.0009922379394993186, - "B": 0.9602634310722351, - "C": 0.001124352915212512, - "D": 0.0030563082545995712 + "A": 0.0029766284860670567, + "B": 0.8785662055015564, + "C": 0.0029766284860670567, + "D": 0.003590499283745885 }, "sample": { "messages": [ @@ -9367,10 +9367,10 @@ ] }, "predict": { - "A": 0.9696336388587952, - "B": 0.002723501529544592, - "C": 0.0024034816306084394, - "D": 0.004490294959396124 + "A": 0.8186271786689758, + "B": 0.004867734853178263, + "C": 0.05930115655064583, + "D": 0.0037909962702542543 }, "sample": { "messages": [ @@ -9412,10 +9412,10 @@ ] }, "predict": { - "A": 0.00042344946996308863, - "B": 0.983068585395813, - "C": 0.00012132029951317236, - "D": 0.00012132029951317236 + "A": 0.00014616524276789278, + "B": 0.9224033355712891, + "C": 0.00016562693053856492, + "D": 0.00014616524276789278 }, "sample": { "messages": [ @@ -9457,10 +9457,10 @@ ] }, "predict": { - "A": 0.0007003386854194105, - "B": 0.0003308165760245174, - "C": 0.986150324344635, - "D": 0.004030171316117048 + "A": 0.00048028930905275047, + "B": 9.457456326344982e-05, + "C": 0.9840072989463806, + "D": 0.0008972985087893903 }, "sample": { "messages": [ @@ -9502,10 +9502,10 @@ ] }, "predict": { - "A": 0.0030722690280526876, - "B": 0.01072327233850956, - "C": 0.965278148651123, - "D": 0.0008802197407931089 + "A": 0.00022528573754243553, + "B": 0.0018862944561988115, + "C": 0.9771247506141663, + "D": 0.0002398154465481639 }, "sample": { "messages": [ @@ -9547,10 +9547,10 @@ ] }, "predict": { - "A": 0.013888362795114517, - "B": 0.0024134356062859297, - "C": 0.9736494421958923, - "D": 0.0007835278520360589 + "A": 0.010828672908246517, + "B": 0.0007369003724306822, + "C": 0.9747660160064697, + "D": 0.00047577868099324405 }, "sample": { "messages": [ @@ -9585,17 +9585,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "C" ] }, "predict": { - "A": 0.009732433594763279, - "B": 0.06346344202756882, - "C": 0.2215091735124588, - "D": 0.6822963356971741 + "A": 0.015066198073327541, + "B": 0.05258619785308838, + "C": 0.440298855304718, + "D": 0.440298855304718 }, "sample": { "messages": [ @@ -9625,7 +9625,7 @@ "prompt_len": 90, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -9637,10 +9637,10 @@ ] }, "predict": { - "A": 0.009575623087584972, - "B": 0.003522674785926938, - "C": 0.9767400622367859, - "D": 0.0021366102155297995 + "A": 0.005837818142026663, + "B": 0.0005101028946228325, + "C": 0.9817700982093811, + "D": 0.00027303840033710003 }, "sample": { "messages": [ @@ -9682,10 +9682,10 @@ ] }, "predict": { - "A": 0.13030757009983063, - "B": 0.6617565155029297, - "C": 0.13030757009983063, - "D": 0.02907555177807808 + "A": 0.07736057043075562, + "B": 0.7339767217636108, + "C": 0.11255897581577301, + "D": 0.007659734692424536 }, "sample": { "messages": [ @@ -9727,10 +9727,10 @@ ] }, "predict": { - "A": 0.005280815996229649, - "B": 0.2883228063583374, - "C": 0.6916504502296448, - "D": 0.0004911924479529262 + "A": 0.002718409290537238, + "B": 0.02008647657930851, + "C": 0.9678207039833069, + "D": 0.00011943855497520417 }, "sample": { "messages": [ @@ -9772,10 +9772,10 @@ ] }, "predict": { - "A": 0.031551558524370193, - "B": 0.003325509838759899, - "C": 0.019136987626552582, - "D": 0.9220716953277588 + "A": 0.09010784327983856, + "B": 0.0009403597214259207, + "C": 0.00395906250923872, + "D": 0.85491943359375 }, "sample": { "messages": [ @@ -9817,10 +9817,10 @@ ] }, "predict": { - "A": 0.00443674810230732, - "B": 0.9580707550048828, - "C": 0.012060331180691719, - "D": 0.005027493927627802 + "A": 0.0022837312426418066, + "B": 0.9213228821754456, + "C": 0.0013851529220119119, + "D": 0.0013851529220119119 }, "sample": { "messages": [ @@ -9862,10 +9862,10 @@ ] }, "predict": { - "A": 0.001022864249534905, - "B": 0.0003320754040032625, - "C": 0.0004831668920814991, - "D": 0.9899028539657593 + "A": 0.00017659559671301395, + "B": 4.753006578539498e-05, + "C": 0.0002267532399855554, + "D": 0.9834896326065063 }, "sample": { "messages": [ @@ -9907,10 +9907,10 @@ ] }, "predict": { - "A": 0.7045314311981201, - "B": 0.035076554864645004, - "C": 0.0006424495368264616, - "D": 0.22872787714004517 + "A": 0.8916540145874023, + "B": 0.004129177890717983, + "C": 0.00023295222490560263, + "D": 0.07319141924381256 }, "sample": { "messages": [ @@ -9952,10 +9952,10 @@ ] }, "predict": { - "A": 0.05963075906038284, - "B": 0.7264513969421387, - "C": 0.03191804513335228, - "D": 0.16209322214126587 + "A": 0.045015618205070496, + "B": 0.5484024882316589, + "C": 0.013728988356888294, + "D": 0.3769111633300781 }, "sample": { "messages": [ @@ -9997,10 +9997,10 @@ ] }, "predict": { - "A": 0.9783462882041931, - "B": 0.0005411080201156437, - "C": 0.00010655049118213356, - "D": 0.00022556737530976534 + "A": 0.9396186470985413, + "B": 7.486820686608553e-05, + "C": 2.5873761842376553e-05, + "D": 3.5365239455131814e-05 }, "sample": { "messages": [ @@ -10042,10 +10042,10 @@ ] }, "predict": { - "A": 0.00022923198412172496, - "B": 0.0002597538405098021, - "C": 0.9942406415939331, - "D": 6.567606033058837e-05 + "A": 7.39536335458979e-05, + "B": 3.0828457965981215e-05, + "C": 0.988000214099884, + "D": 1.5501522284466773e-05 }, "sample": { "messages": [ @@ -10087,10 +10087,10 @@ ] }, "predict": { - "A": 0.009512760676443577, - "B": 0.0018731735181063414, - "C": 0.00030578719452023506, - "D": 0.9703279137611389 + "A": 0.004964939784258604, + "B": 0.0001698908890830353, + "C": 9.093604603549466e-05, + "D": 0.9461500644683838 }, "sample": { "messages": [ @@ -10132,10 +10132,10 @@ ] }, "predict": { - "A": 0.00392504595220089, - "B": 0.9604270458221436, - "C": 0.0005311971763148904, - "D": 0.0011245444184169173 + "A": 0.0031853767577558756, + "B": 0.8832166194915771, + "C": 0.0013278624974191189, + "D": 0.0019320285646244884 }, "sample": { "messages": [ @@ -10177,10 +10177,10 @@ ] }, "predict": { - "A": 0.036672502756118774, - "B": 0.004379906225949526, - "C": 0.9457963109016418, - "D": 0.0006716803181916475 + "A": 0.017555611208081245, + "B": 0.0004128683649469167, + "C": 0.9585038423538208, + "D": 0.00010438948083901778 }, "sample": { "messages": [ @@ -10222,10 +10222,10 @@ ] }, "predict": { - "A": 0.001022334792651236, - "B": 0.0014874882763251662, - "C": 0.9893904328346252, - "D": 0.0007961951196193695 + "A": 0.00037706951843574643, + "B": 0.00015718594659119844, + "C": 0.991951584815979, + "D": 0.00013031174603383988 }, "sample": { "messages": [ @@ -10267,10 +10267,10 @@ ] }, "predict": { - "A": 0.005840929225087166, - "B": 0.9822933077812195, - "C": 0.0005432910402305424, - "D": 0.0013032875722274184 + "A": 0.006477350369095802, + "B": 0.9613240361213684, + "C": 0.0002082140854327008, + "D": 0.000682707701344043 }, "sample": { "messages": [ @@ -10312,10 +10312,10 @@ ] }, "predict": { - "A": 0.7935203909873962, - "B": 0.023962240666151047, - "C": 0.006865296512842178, - "D": 0.08363643288612366 + "A": 0.8214138150215149, + "B": 0.0023071824107319117, + "C": 0.0007973408210091293, + "D": 0.007106621749699116 }, "sample": { "messages": [ @@ -10357,10 +10357,10 @@ ] }, "predict": { - "A": 0.8440103530883789, - "B": 0.06928058713674545, - "C": 0.028880445286631584, - "D": 0.004428959917277098 + "A": 0.828447699546814, + "B": 0.02207738347351551, + "C": 0.032122403383255005, + "D": 0.0017024249536916614 }, "sample": { "messages": [ @@ -10402,10 +10402,10 @@ ] }, "predict": { - "A": 0.013329321518540382, - "B": 0.036232851445674896, - "C": 0.0008521150448359549, - "D": 0.9344576001167297 + "A": 0.01039037574082613, + "B": 0.005561566445976496, + "C": 0.00042886199662461877, + "D": 0.9353117346763611 }, "sample": { "messages": [ @@ -10447,10 +10447,10 @@ ] }, "predict": { - "A": 0.08334122598171234, - "B": 0.8960026502609253, - "C": 0.00011057550727855414, - "D": 0.0003405965690035373 + "A": 0.07991313189268112, + "B": 0.8591471314430237, + "C": 6.430874054785818e-05, + "D": 0.0002882117696572095 }, "sample": { "messages": [ @@ -10492,10 +10492,10 @@ ] }, "predict": { - "A": 0.0014711078256368637, - "B": 0.9784950613975525, - "C": 0.0012982480693608522, - "D": 0.0010110766161233187 + "A": 0.0009422646835446358, + "B": 0.9119005799293518, + "C": 0.000504357973113656, + "D": 0.000504357973113656 }, "sample": { "messages": [ @@ -10537,10 +10537,10 @@ ] }, "predict": { - "A": 0.004563693422824144, - "B": 0.0975760668516159, - "C": 0.4373055696487427, - "D": 0.4373055696487427 + "A": 0.0016517284093424678, + "B": 0.01567116193473339, + "C": 0.5880560278892517, + "D": 0.3566740155220032 }, "sample": { "messages": [ @@ -10582,10 +10582,10 @@ ] }, "predict": { - "A": 0.003138785483315587, - "B": 0.0014826571568846703, - "C": 0.9861770272254944, - "D": 0.0013084403472021222 + "A": 0.002429278800264001, + "B": 0.0002725575177464634, + "C": 0.9800410270690918, + "D": 0.00037254198105074465 }, "sample": { "messages": [ @@ -10627,10 +10627,10 @@ ] }, "predict": { - "A": 0.1196407899260521, - "B": 0.03884167596697807, - "C": 0.7801559567451477, - "D": 0.016191620379686356 + "A": 0.2030148059129715, + "B": 0.018883317708969116, + "C": 0.7085912823677063, + "D": 0.014706341549754143 }, "sample": { "messages": [ @@ -10672,10 +10672,10 @@ ] }, "predict": { - "A": 0.01359811332076788, - "B": 0.00934583880007267, - "C": 0.9533013701438904, - "D": 0.010590222664177418 + "A": 0.002939174883067608, + "B": 0.0009542102925479412, + "C": 0.9830194115638733, + "D": 0.0051584127359092236 }, "sample": { "messages": [ @@ -10717,10 +10717,10 @@ ] }, "predict": { - "A": 0.04614056646823883, - "B": 0.011666161939501762, - "C": 0.9267580509185791, - "D": 0.0017890639137476683 + "A": 0.022424276918172836, + "B": 0.002678198041394353, + "C": 0.9535044431686401, + "D": 0.0004654010117519647 }, "sample": { "messages": [ @@ -10762,10 +10762,10 @@ ] }, "predict": { - "A": 0.0010190103203058243, - "B": 0.11778157949447632, - "C": 0.8702946901321411, - "D": 0.0008992734365165234 + "A": 0.00017747518722899258, + "B": 0.00587717117741704, + "C": 0.9883882403373718, + "D": 4.215404987917282e-05 }, "sample": { "messages": [ @@ -10807,10 +10807,10 @@ ] }, "predict": { - "A": 0.0030933781526982784, - "B": 0.9719104766845703, - "C": 0.00088626763317734, - "D": 0.0011379901552572846 + "A": 0.002201465191319585, + "B": 0.8881344199180603, + "C": 0.000976894167251885, + "D": 0.005281039979308844 }, "sample": { "messages": [ @@ -10852,10 +10852,10 @@ ] }, "predict": { - "A": 0.02997628040611744, - "B": 0.5313422679901123, - "C": 0.0031594764441251755, - "D": 0.41380977630615234 + "A": 0.02094159461557865, + "B": 0.7858275771141052, + "C": 0.0018298561917617917, + "D": 0.1547386199235916 }, "sample": { "messages": [ @@ -10897,10 +10897,10 @@ ] }, "predict": { - "A": 0.003549112705513835, - "B": 0.003549112705513835, - "C": 0.9840705394744873, - "D": 0.0011522280983626842 + "A": 0.0024457431863993406, + "B": 0.0007007171516306698, + "C": 0.986683189868927, + "D": 0.00027440476696938276 }, "sample": { "messages": [ @@ -10942,10 +10942,10 @@ ] }, "predict": { - "A": 0.006548524834215641, - "B": 0.9718872904777527, - "C": 0.0006091076647862792, - "D": 0.0014611734077334404 + "A": 0.011569647118449211, + "B": 0.9190909266471863, + "C": 0.0008921553380787373, + "D": 0.0029252651147544384 }, "sample": { "messages": [ @@ -10987,10 +10987,10 @@ ] }, "predict": { - "A": 0.009583566337823868, - "B": 0.005129713099449873, - "C": 0.9775502681732178, - "D": 0.0005406677373684943 + "A": 0.005154064856469631, + "B": 0.0010148956207558513, + "C": 0.9821909666061401, + "D": 0.00017636240227147937 }, "sample": { "messages": [ @@ -11032,10 +11032,10 @@ ] }, "predict": { - "A": 0.002682407619431615, - "B": 0.022459523752331734, - "C": 0.003444279544055462, - "D": 0.9550032019615173 + "A": 0.0015994023997336626, + "B": 0.0020536731462925673, + "C": 0.0031807913910597563, + "D": 0.9388258457183838 }, "sample": { "messages": [ @@ -11077,10 +11077,10 @@ ] }, "predict": { - "A": 0.0026910468004643917, - "B": 0.9580789804458618, - "C": 0.0011217951541766524, - "D": 0.00032139968243427575 + "A": 0.006883213762193918, + "B": 0.9015231728553772, + "C": 0.002869350602850318, + "D": 0.0003426950715947896 }, "sample": { "messages": [ @@ -11122,10 +11122,10 @@ ] }, "predict": { - "A": 0.5563988089561462, - "B": 0.12414935976266861, - "C": 0.2628242075443268, - "D": 0.024446457624435425 + "A": 0.684053897857666, + "B": 0.018229417502880096, + "C": 0.08169858902692795, + "D": 0.007599152624607086 }, "sample": { "messages": [ @@ -11167,10 +11167,10 @@ ] }, "predict": { - "A": 0.009209617041051388, - "B": 0.017205828800797462, - "C": 0.9394063949584961, - "D": 0.002989924745634198 + "A": 0.0031150199938565493, + "B": 0.0021409199107438326, + "C": 0.9787101745605469, + "D": 0.0003960303438361734 }, "sample": { "messages": [ @@ -11212,10 +11212,10 @@ ] }, "predict": { - "A": 0.025073105469346046, - "B": 0.0038450853899121284, - "C": 0.0024825737345963717, - "D": 0.9408613443374634 + "A": 0.01043019350618124, + "B": 0.00048782661906443536, + "C": 0.001032728934660554, + "D": 0.9388960599899292 }, "sample": { "messages": [ @@ -11257,10 +11257,10 @@ ] }, "predict": { - "A": 0.937153697013855, - "B": 0.019450003281235695, - "C": 0.00337990396656096, - "D": 0.0015965537168085575 + "A": 0.8596862554550171, + "B": 0.003100512782111764, + "C": 0.003981137648224831, + "D": 0.000370303459931165 }, "sample": { "messages": [ @@ -11302,10 +11302,10 @@ ] }, "predict": { - "A": 0.0066993627697229385, - "B": 0.1957840770483017, - "C": 0.7743410468101501, - "D": 0.0031645549461245537 + "A": 0.000618723570369184, + "B": 0.0019058029865846038, + "C": 0.9872303605079651, + "D": 0.0003752748598344624 }, "sample": { "messages": [ @@ -11340,17 +11340,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.007386344950646162, - "B": 0.6648975610733032, - "C": 0.0011327327229082584, - "D": 0.31407538056373596 + "A": 0.008446644991636276, + "B": 0.35916048288345337, + "C": 0.001143129076808691, + "D": 0.5921555161476135 }, "sample": { "messages": [ @@ -11380,7 +11380,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -11392,10 +11392,10 @@ ] }, "predict": { - "A": 0.0034624249674379826, - "B": 0.8472274541854858, - "C": 0.0007725714240223169, - "D": 0.12992653250694275 + "A": 0.002879815874621272, + "B": 0.9048112034797668, + "C": 0.0006840162095613778, + "D": 0.0350833386182785 }, "sample": { "messages": [ @@ -11437,10 +11437,10 @@ ] }, "predict": { - "A": 0.08716852962970734, - "B": 0.04665795713663101, - "C": 0.8270320296287537, - "D": 0.011796977370977402 + "A": 0.015580181032419205, + "B": 0.005058144219219685, + "C": 0.9639116525650024, + "D": 0.0014491825131699443 }, "sample": { "messages": [ @@ -11482,10 +11482,10 @@ ] }, "predict": { - "A": 0.044204745441675186, - "B": 0.007681633345782757, - "C": 0.03442669287323952, - "D": 0.8878760933876038 + "A": 0.010182025842368603, + "B": 0.0007375834393315017, + "C": 0.010182025842368603, + "D": 0.9165566563606262 }, "sample": { "messages": [ @@ -11527,10 +11527,10 @@ ] }, "predict": { - "A": 0.010690798051655293, - "B": 0.9623548984527588, - "C": 0.007347669918090105, - "D": 0.003932924475520849 + "A": 0.00703086843714118, + "B": 0.920862078666687, + "C": 0.0033211472909897566, + "D": 0.0015687990235164762 }, "sample": { "messages": [ @@ -11572,10 +11572,10 @@ ] }, "predict": { - "A": 0.007363362703472376, - "B": 0.9644102454185486, - "C": 0.00041541250539012253, - "D": 0.0005334002198651433 + "A": 0.0062462990172207355, + "B": 0.9270329475402832, + "C": 0.0005127274198457599, + "D": 0.0007008152897469699 }, "sample": { "messages": [ @@ -11617,10 +11617,10 @@ ] }, "predict": { - "A": 0.0021255044266581535, - "B": 0.9716630578041077, - "C": 0.0003259566437918693, - "D": 0.0014608362689614296 + "A": 0.0048125507310032845, + "B": 0.9171097874641418, + "C": 0.0004205159784760326, + "D": 0.002419902477413416 }, "sample": { "messages": [ @@ -11662,10 +11662,10 @@ ] }, "predict": { - "A": 0.027714012190699577, - "B": 0.8099222183227539, - "C": 0.09673141688108444, - "D": 0.03558550029993057 + "A": 0.11212754994630814, + "B": 0.6452495455741882, + "C": 0.14397463202476501, + "D": 0.05296530947089195 }, "sample": { "messages": [ @@ -11707,10 +11707,10 @@ ] }, "predict": { - "A": 0.05394141003489494, - "B": 0.31041139364242554, - "C": 0.0254801195114851, - "D": 0.5799248218536377 + "A": 0.12182382494211197, + "B": 0.25790104269981384, + "C": 0.0652075931429863, + "D": 0.48182258009910583 }, "sample": { "messages": [ @@ -11752,10 +11752,10 @@ ] }, "predict": { - "A": 0.5033151507377625, - "B": 0.4441740810871124, - "C": 0.011836838908493519, - "D": 0.02839510142803192 + "A": 0.6927666664123535, + "B": 0.22490841150283813, + "C": 0.030438045039772987, + "D": 0.020919742062687874 }, "sample": { "messages": [ @@ -11797,10 +11797,10 @@ ] }, "predict": { - "A": 0.008425075560808182, - "B": 0.9738063216209412, - "C": 0.00032667562481947243, - "D": 0.002130192704498768 + "A": 0.009020074270665646, + "B": 0.9200725555419922, + "C": 0.0006534119602292776, + "D": 0.003760126419365406 }, "sample": { "messages": [ @@ -11842,10 +11842,10 @@ ] }, "predict": { - "A": 0.9604753255844116, - "B": 0.000992456916719675, - "C": 0.00046880345325917006, - "D": 8.146581967594102e-05 + "A": 0.8349303603172302, + "B": 0.0001096840132959187, + "C": 0.00016988192510325462, + "D": 2.9521052056225017e-05 }, "sample": { "messages": [ @@ -11887,10 +11887,10 @@ ] }, "predict": { - "A": 0.017750808969140053, - "B": 0.003495341632515192, - "C": 0.969161331653595, - "D": 0.00039216646109707654 + "A": 0.004021170549094677, + "B": 0.00039814977208152413, + "C": 0.9839479327201843, + "D": 0.00010716056567616761 }, "sample": { "messages": [ @@ -11932,10 +11932,10 @@ ] }, "predict": { - "A": 0.14499182999134064, - "B": 0.7363293170928955, - "C": 0.06848929077386856, - "D": 0.013486341573297977 + "A": 0.2717277705669403, + "B": 0.5752477049827576, + "C": 0.0778513103723526, + "D": 0.006003248505294323 }, "sample": { "messages": [ @@ -11970,17 +11970,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.3135727345943451, - "B": 0.008356430567800999, - "C": 0.663833498954773, - "D": 0.00030438293470069766 + "A": 0.7879837155342102, + "B": 0.0007648904575034976, + "C": 0.13693101704120636, + "D": 8.0618861829862e-05 }, "sample": { "messages": [ @@ -12010,7 +12010,7 @@ "prompt_len": 60, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -12022,10 +12022,10 @@ ] }, "predict": { - "A": 0.06570722907781601, - "B": 0.014661264605820179, - "C": 0.9070603251457214, - "D": 0.0005684788920916617 + "A": 0.0361282154917717, + "B": 0.0029655846301466227, + "C": 0.9317589402198792, + "D": 0.00022868133964948356 }, "sample": { "messages": [ @@ -12067,10 +12067,10 @@ ] }, "predict": { - "A": 0.03660103306174278, - "B": 0.004953410942107439, - "C": 0.9439530968666077, - "D": 0.006360305938869715 + "A": 0.04655614122748375, + "B": 0.001094895415008068, + "C": 0.9351050853729248, + "D": 0.0018051775405183434 }, "sample": { "messages": [ @@ -12112,10 +12112,10 @@ ] }, "predict": { - "A": 0.0016735615208745003, - "B": 0.0018963934853672981, - "C": 0.0003295437491033226, - "D": 0.9823561310768127 + "A": 0.0014558712719008327, + "B": 0.0003248488064855337, + "C": 0.00019703076395671815, + "D": 0.9683606624603271 }, "sample": { "messages": [ @@ -12157,10 +12157,10 @@ ] }, "predict": { - "A": 0.012193499132990837, - "B": 0.9686495661735535, - "C": 0.0005357449408620596, - "D": 0.00165021070279181 + "A": 0.007218622136861086, + "B": 0.9454529881477356, + "C": 0.00046147112152539194, + "D": 0.001254408503882587 }, "sample": { "messages": [ @@ -12202,10 +12202,10 @@ ] }, "predict": { - "A": 0.21448691189289093, - "B": 0.009423897601664066, - "C": 0.7486328482627869, - "D": 0.0006826648022979498 + "A": 0.07143763452768326, + "B": 0.001229152432642877, + "C": 0.8702885508537292, + "D": 0.00014680132153443992 }, "sample": { "messages": [ @@ -12247,10 +12247,10 @@ ] }, "predict": { - "A": 0.9628058671951294, - "B": 0.0016402553301304579, - "C": 0.0003229853755328804, - "D": 0.0007748017087578773 + "A": 0.9235236048698425, + "B": 0.00014634277613367885, + "C": 8.876138599589467e-05, + "D": 0.00018790784815791994 }, "sample": { "messages": [ @@ -12292,10 +12292,10 @@ ] }, "predict": { - "A": 0.7974803447723389, - "B": 0.05098121240735054, - "C": 0.008859206922352314, - "D": 0.08405380696058273 + "A": 0.8913137316703796, + "B": 0.011219983920454979, + "C": 0.006392953917384148, + "D": 0.02691534347832203 }, "sample": { "messages": [ @@ -12337,10 +12337,10 @@ ] }, "predict": { - "A": 0.4776208996772766, - "B": 0.13684068620204926, - "C": 0.2556520700454712, - "D": 0.0940491259098053 + "A": 0.5614146590232849, + "B": 0.07597921043634415, + "C": 0.3005036413669586, + "D": 0.035890039056539536 }, "sample": { "messages": [ @@ -12382,10 +12382,10 @@ ] }, "predict": { - "A": 0.06643310934305191, - "B": 0.007934303022921085, - "C": 0.9170807003974915, - "D": 0.000539936067070812 + "A": 0.05930501967668533, + "B": 0.0029526231810450554, + "C": 0.9276866316795349, + "D": 0.0001773187832441181 }, "sample": { "messages": [ @@ -12427,10 +12427,10 @@ ] }, "predict": { - "A": 0.0008689457317814231, - "B": 0.9529146552085876, - "C": 0.00012518350558821112, - "D": 0.00019388800137676299 + "A": 0.001120464876294136, + "B": 0.8989645838737488, + "C": 0.00016141825472004712, + "D": 0.0003872223023790866 }, "sample": { "messages": [ @@ -12465,17 +12465,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "D" ] }, "predict": { - "A": 0.03440293297171593, - "B": 0.7830070853233337, - "C": 0.0028239646926522255, - "D": 0.15418322384357452 + "A": 0.13825061917304993, + "B": 0.03960946947336197, + "C": 0.0006402274593710899, + "D": 0.7955774664878845 }, "sample": { "messages": [ @@ -12505,7 +12505,7 @@ "prompt_len": 80, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -12517,10 +12517,10 @@ ] }, "predict": { - "A": 0.8110762238502502, - "B": 0.10976723581552505, - "C": 0.04038110747933388, - "D": 0.005464988760650158 + "A": 0.8852933049201965, + "B": 0.008679110556840897, + "C": 0.009834720753133297, + "D": 0.0014168258057907224 }, "sample": { "messages": [ @@ -12562,10 +12562,10 @@ ] }, "predict": { - "A": 0.9585928916931152, - "B": 0.006458947900682688, - "C": 0.00036438892129808664, - "D": 0.00022101304784882814 + "A": 0.8785814642906189, + "B": 0.0016960612265393138, + "C": 0.00020256561401765794, + "D": 9.56852309172973e-05 }, "sample": { "messages": [ @@ -12607,10 +12607,10 @@ ] }, "predict": { - "A": 0.9266770482063293, - "B": 0.027983224019408226, - "C": 0.00045230670366436243, - "D": 0.013218337669968605 + "A": 0.9069848656654358, + "B": 0.003706640098243952, + "C": 0.00026850809808820486, + "D": 0.003706640098243952 }, "sample": { "messages": [ @@ -12645,17 +12645,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.30775511264801025, - "B": 0.6515175700187683, - "C": 0.017362354323267937, - "D": 0.0012577248271554708 + "A": 0.9149996638298035, + "B": 0.014789591543376446, + "C": 0.0029122433625161648, + "D": 0.00016429752577096224 }, "sample": { "messages": [ @@ -12685,7 +12685,7 @@ "prompt_len": 92, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -12697,10 +12697,10 @@ ] }, "predict": { - "A": 0.9496868848800659, - "B": 0.00439792312681675, - "C": 0.0012600260088220239, - "D": 0.0014277964364737272 + "A": 0.8904411196708679, + "B": 0.0007165673887357116, + "C": 0.0005242518964223564, + "D": 0.001338724046945572 }, "sample": { "messages": [ @@ -12742,10 +12742,10 @@ ] }, "predict": { - "A": 0.03212623670697212, - "B": 0.017195936292409897, - "C": 0.9388663172721863, - "D": 0.000588412513025105 + "A": 0.05242360755801201, + "B": 0.0020326836965978146, + "C": 0.9292306303977966, + "D": 8.93098913365975e-05 }, "sample": { "messages": [ @@ -12787,10 +12787,10 @@ ] }, "predict": { - "A": 0.9452246427536011, - "B": 0.017312394455075264, - "C": 0.0011067442828789353, - "D": 0.001824712846428156 + "A": 0.8758726119995117, + "B": 0.0031588899437338114, + "C": 0.0008502036798745394, + "D": 0.0007048436091281474 }, "sample": { "messages": [ @@ -12832,10 +12832,10 @@ ] }, "predict": { - "A": 0.024841777980327606, - "B": 0.8226466774940491, - "C": 0.11133311688899994, - "D": 0.019346795976161957 + "A": 0.05936938524246216, + "B": 0.723267138004303, + "C": 0.1424197405576706, + "D": 0.01324710063636303 }, "sample": { "messages": [ @@ -12877,10 +12877,10 @@ ] }, "predict": { - "A": 0.027827944606542587, - "B": 0.007972839288413525, - "C": 0.02455807290971279, - "D": 0.9215348958969116 + "A": 0.007973406463861465, + "B": 0.0006544970674440265, + "C": 0.009035054594278336, + "D": 0.9216005206108093 }, "sample": { "messages": [ @@ -12922,10 +12922,10 @@ ] }, "predict": { - "A": 0.00037627166602760553, - "B": 9.513636905467138e-05, - "C": 1.2875307220383547e-05, - "D": 0.9898526072502136 + "A": 0.00032755109714344144, + "B": 1.967097159649711e-05, + "C": 5.294370112096658e-06, + "D": 0.9764160513877869 }, "sample": { "messages": [ @@ -12967,10 +12967,10 @@ ] }, "predict": { - "A": 0.6988697648048401, - "B": 0.008797472342848778, - "C": 0.004155631642788649, - "D": 0.25709983706474304 + "A": 0.8096667528152466, + "B": 0.0009480222361162305, + "C": 0.000540166802238673, + "D": 0.15943282842636108 }, "sample": { "messages": [ @@ -13012,10 +13012,10 @@ ] }, "predict": { - "A": 0.007389664184302092, - "B": 0.009488517418503761, - "C": 0.9678550958633423, - "D": 0.007389664184302092 + "A": 0.0019046548986807466, + "B": 0.0013090488500893116, + "C": 0.9866356253623962, + "D": 0.0007939782808534801 }, "sample": { "messages": [ @@ -13057,10 +13057,10 @@ ] }, "predict": { - "A": 0.0030867778696119785, - "B": 0.001002129982225597, - "C": 0.00021005785674788058, - "D": 0.9698367714881897 + "A": 0.003018271876499057, + "B": 0.0002186428610002622, + "C": 8.043421257752925e-05, + "D": 0.9483127593994141 }, "sample": { "messages": [ @@ -13095,17 +13095,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.30545148253440857, - "B": 0.019526859745383263, - "C": 0.6466407775878906, - "D": 0.002642672974616289 + "A": 0.6230327486991882, + "B": 0.004197961650788784, + "C": 0.2597186863422394, + "D": 0.002546192379668355 }, "sample": { "messages": [ @@ -13135,7 +13135,7 @@ "prompt_len": 93, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -13147,10 +13147,10 @@ ] }, "predict": { - "A": 0.12028726935386658, - "B": 0.1983201950788498, - "C": 0.2885541617870331, - "D": 0.37051087617874146 + "A": 0.1280355304479599, + "B": 0.06853248178958893, + "C": 0.3071412742137909, + "D": 0.44688791036605835 }, "sample": { "messages": [ @@ -13192,10 +13192,10 @@ ] }, "predict": { - "A": 0.8495403528213501, - "B": 0.06973452121019363, - "C": 0.01763164810836315, - "D": 0.0016399987507611513 + "A": 0.7987152934074402, + "B": 0.01657680608332157, + "C": 0.003698785789310932, + "D": 0.00047024781815707684 }, "sample": { "messages": [ @@ -13237,10 +13237,10 @@ ] }, "predict": { - "A": 0.0023960107937455177, - "B": 0.017704257741570473, - "C": 0.9666196703910828, - "D": 0.002715036040171981 + "A": 0.0002586387563496828, + "B": 0.00035351727274246514, + "C": 0.9899724721908569, + "D": 0.0001568723382661119 }, "sample": { "messages": [ @@ -13275,17 +13275,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.2944811284542084, - "B": 0.06570761650800705, - "C": 0.002547760959714651, - "D": 0.623416543006897 + "A": 0.5575661063194275, + "B": 0.010212179273366928, + "C": 0.0008382666856050491, + "D": 0.3832091987133026 }, "sample": { "messages": [ @@ -13315,7 +13315,7 @@ "prompt_len": 103, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -13327,10 +13327,10 @@ ] }, "predict": { - "A": 0.028025222942233086, - "B": 0.8190170526504517, - "C": 0.013238177634775639, - "D": 0.12560033798217773 + "A": 0.02092425338923931, + "B": 0.8897218108177185, + "C": 0.007697602268308401, + "D": 0.044296640902757645 }, "sample": { "messages": [ @@ -13372,10 +13372,10 @@ ] }, "predict": { - "A": 0.02885315753519535, - "B": 0.9554853439331055, - "C": 0.0018445206806063652, - "D": 0.0030411004554480314 + "A": 0.2849080264568329, + "B": 0.6834588050842285, + "C": 0.0009067997452802956, + "D": 0.0019196952925994992 }, "sample": { "messages": [ @@ -13417,10 +13417,10 @@ ] }, "predict": { - "A": 0.0016473927535116673, - "B": 0.004478077404201031, - "C": 0.0004433896974660456, - "D": 0.9669954180717468 + "A": 0.0004903904628008604, + "B": 0.0008606621413491666, + "C": 0.0001804045750759542, + "D": 0.9438306093215942 }, "sample": { "messages": [ @@ -13462,10 +13462,10 @@ ] }, "predict": { - "A": 0.9292485117912292, - "B": 0.006261227186769247, - "C": 0.002032723044976592, - "D": 0.00016685605805832893 + "A": 0.7964016795158386, + "B": 0.0006020611035637558, + "C": 0.0009324904531240463, + "D": 5.961212809779681e-05 }, "sample": { "messages": [ @@ -13507,10 +13507,10 @@ ] }, "predict": { - "A": 0.9809083938598633, - "B": 0.005832694470882416, - "C": 8.319891639985144e-05, - "D": 5.0462698709452525e-05 + "A": 0.913475513458252, + "B": 0.0009438920533284545, + "C": 2.2198213628144003e-05, + "D": 1.5256594451784622e-05 }, "sample": { "messages": [ @@ -13552,10 +13552,10 @@ ] }, "predict": { - "A": 0.9598485827445984, - "B": 0.0014430738519877195, - "C": 0.00011845472181448713, - "D": 0.0005308772088028491 + "A": 0.892905056476593, + "B": 0.00010351696982979774, + "C": 2.7861213311553e-05, + "D": 7.573462789878249e-05 }, "sample": { "messages": [ @@ -13597,10 +13597,10 @@ ] }, "predict": { - "A": 0.006569379474967718, - "B": 0.0031031554099172354, - "C": 0.9749823808670044, - "D": 0.0008890688768588006 + "A": 0.0011471601901575923, + "B": 0.00017592271615285426, + "C": 0.9797422289848328, + "D": 8.845949923852459e-05 }, "sample": { "messages": [ @@ -13642,10 +13642,10 @@ ] }, "predict": { - "A": 0.12315786629915237, - "B": 0.4870988130569458, - "C": 0.06592164933681488, - "D": 0.29544034600257874 + "A": 0.06835056841373444, + "B": 0.5050461888313293, + "C": 0.060319166630506516, + "D": 0.3063260018825531 }, "sample": { "messages": [ @@ -13680,17 +13680,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.36374959349632263, - "B": 0.03833892196416855, - "C": 0.46706369519233704, - "D": 0.08116349577903748 + "A": 0.4846256375312805, + "B": 0.006912813056260347, + "C": 0.3774268329143524, + "D": 0.027340708300471306 }, "sample": { "messages": [ @@ -13720,7 +13720,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -13732,10 +13732,10 @@ ] }, "predict": { - "A": 0.001658107154071331, - "B": 0.9732846617698669, - "C": 0.00012785948638338596, - "D": 0.0003265006234869361 + "A": 0.0008473891066387296, + "B": 0.9292749762535095, + "C": 4.49099788966123e-05, + "D": 0.000214253508602269 }, "sample": { "messages": [ @@ -13777,10 +13777,10 @@ ] }, "predict": { - "A": 0.9703651666641235, - "B": 0.006538269110023975, - "C": 0.0018732454627752304, - "D": 0.0024052949156612158 + "A": 0.8898710608482361, + "B": 0.001516002113930881, + "C": 0.001337866997346282, + "D": 0.00029851848375983536 }, "sample": { "messages": [ @@ -13822,10 +13822,10 @@ ] }, "predict": { - "A": 0.6119654178619385, - "B": 0.32756149768829346, - "C": 0.004123390652239323, - "D": 0.018479755148291588 + "A": 0.5074068307876587, + "B": 0.39516884088516235, + "C": 0.0020736558362841606, + "D": 0.009293480776250362 }, "sample": { "messages": [ @@ -13867,10 +13867,10 @@ ] }, "predict": { - "A": 0.9443578720092773, - "B": 0.0038593746721744537, - "C": 0.002065774518996477, - "D": 0.0005918542155995965 + "A": 0.903435468673706, + "B": 0.00025125290267169476, + "C": 0.0006027243798598647, + "D": 6.352668424369767e-05 }, "sample": { "messages": [ @@ -13912,10 +13912,10 @@ ] }, "predict": { - "A": 0.8202772736549377, - "B": 0.019291073083877563, - "C": 0.05243856832385063, - "D": 0.07629767060279846 + "A": 0.44335928559303284, + "B": 0.004925277084112167, + "C": 0.44335928559303284, + "D": 0.02501262165606022 }, "sample": { "messages": [ @@ -13945,22 +13945,22 @@ "prompt_len": 86, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.3183550536632538, - "B": 0.048821330070495605, - "C": 0.4087759852409363, - "D": 0.19309210777282715 + "A": 0.6505192518234253, + "B": 0.008188828825950623, + "C": 0.08803820610046387, + "D": 0.1863768994808197 }, "sample": { "messages": [ @@ -13990,7 +13990,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -14002,10 +14002,10 @@ ] }, "predict": { - "A": 0.0034789242781698704, - "B": 0.0034789242781698704, - "C": 0.003942137584090233, - "D": 0.9646092653274536 + "A": 0.0020834750030189753, + "B": 0.0008685215725563467, + "C": 0.0014319499023258686, + "D": 0.9524495601654053 }, "sample": { "messages": [ @@ -14047,10 +14047,10 @@ ] }, "predict": { - "A": 0.9562296867370605, - "B": 0.0073009030893445015, - "C": 0.0008719685720279813, - "D": 0.0005288756801746786 + "A": 0.9031164646148682, + "B": 0.0006827350007370114, + "C": 0.0003029618237633258, + "D": 0.0001523387763882056 }, "sample": { "messages": [ @@ -14092,10 +14092,10 @@ ] }, "predict": { - "A": 0.5919008255004883, - "B": 0.07069247215986252, - "C": 0.0031060106121003628, - "D": 0.27959415316581726 + "A": 0.5163981318473816, + "B": 0.06988688558340073, + "C": 0.002246510237455368, + "D": 0.3132112920284271 }, "sample": { "messages": [ @@ -14137,10 +14137,10 @@ ] }, "predict": { - "A": 0.0006985449581407011, - "B": 0.9836245775222778, - "C": 0.0003739041858352721, - "D": 0.00042368893628008664 + "A": 0.0003100005560554564, + "B": 0.9240986704826355, + "C": 0.00037393203820101917, + "D": 0.0002912186027970165 }, "sample": { "messages": [ @@ -14182,10 +14182,10 @@ ] }, "predict": { - "A": 0.49818456172943115, - "B": 0.09809835255146027, - "C": 0.2666589915752411, - "D": 0.09809835255146027 + "A": 0.4862973988056183, + "B": 0.05807994306087494, + "C": 0.15787765383720398, + "D": 0.17889884114265442 }, "sample": { "messages": [ @@ -14227,10 +14227,10 @@ ] }, "predict": { - "A": 0.0004839761240873486, - "B": 0.0011609982466325164, - "C": 0.9915607571601868, - "D": 0.0009041863959282637 + "A": 0.00027492750086821616, + "B": 0.0001471580908400938, + "C": 0.988562822341919, + "D": 0.00018895472749136388 }, "sample": { "messages": [ @@ -14272,10 +14272,10 @@ ] }, "predict": { - "A": 0.002342597581446171, - "B": 0.02853868156671524, - "C": 0.9450713396072388, - "D": 0.0026545110158622265 + "A": 0.0006591727142222226, + "B": 0.02473527379333973, + "C": 0.9281843304634094, + "D": 0.0010867920937016606 }, "sample": { "messages": [ @@ -14317,10 +14317,10 @@ ] }, "predict": { - "A": 0.0464814268052578, - "B": 0.0026223030872642994, - "C": 0.00027638868778012693, - "D": 0.9336044788360596 + "A": 0.035336121916770935, + "B": 0.0008310260018333793, + "C": 0.00010565310367383063, + "D": 0.9113306403160095 }, "sample": { "messages": [ @@ -14362,10 +14362,10 @@ ] }, "predict": { - "A": 0.9583302736282349, - "B": 0.004437949974089861, - "C": 0.0006006111507304013, - "D": 0.0026917527429759502 + "A": 0.9031568169593811, + "B": 0.0005660324241034687, + "C": 0.00022166152484714985, + "D": 0.0004692574148066342 }, "sample": { "messages": [ @@ -14407,10 +14407,10 @@ ] }, "predict": { - "A": 0.0011574000818654895, - "B": 0.9884876608848572, - "C": 0.00013823172776028514, - "D": 8.3841776358895e-05 + "A": 0.002044237917289138, + "B": 0.9345125555992126, + "C": 0.00022935715969651937, + "D": 8.437578071607277e-05 }, "sample": { "messages": [ @@ -14452,10 +14452,10 @@ ] }, "predict": { - "A": 0.012229138053953648, - "B": 0.004226277116686106, - "C": 0.0010038287146016955, - "D": 0.9714807271957397 + "A": 0.008424880914390087, + "B": 0.00047529928269796073, + "C": 0.00021091275266371667, + "D": 0.9737837910652161 }, "sample": { "messages": [ @@ -14497,10 +14497,10 @@ ] }, "predict": { - "A": 0.0003720140375662595, - "B": 9.405987657373771e-05, - "C": 9.405987657373771e-05, - "D": 0.9786521792411804 + "A": 0.0001387550146318972, + "B": 2.2651132894679904e-05, + "C": 2.2651132894679904e-05, + "D": 0.932113528251648 }, "sample": { "messages": [ @@ -14542,10 +14542,10 @@ ] }, "predict": { - "A": 0.007395404856652021, - "B": 0.0034933423157781363, - "C": 0.00022332188382279128, - "D": 0.9686069488525391 + "A": 0.001699518645182252, + "B": 0.0002160695439670235, + "C": 6.190496060298756e-05, + "D": 0.9371516108512878 }, "sample": { "messages": [ @@ -14580,17 +14580,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "C" ] }, "predict": { - "A": 0.74624103307724, - "B": 0.0175499077886343, - "C": 0.1886792778968811, - "D": 0.00569762010127306 + "A": 0.3586006462574005, + "B": 0.003515596967190504, + "C": 0.5912325382232666, + "D": 0.0008350274874828756 }, "sample": { "messages": [ @@ -14620,7 +14620,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -14632,10 +14632,10 @@ ] }, "predict": { - "A": 0.013617509976029396, - "B": 0.009359169751405716, - "C": 0.005676622968167067, - "D": 0.9546611905097961 + "A": 0.0026701365131884813, + "B": 0.0008143455488607287, + "C": 0.003025660989806056, + "D": 0.9506344199180603 }, "sample": { "messages": [ @@ -14677,10 +14677,10 @@ ] }, "predict": { - "A": 0.039910539984703064, - "B": 0.0352209247648716, - "C": 0.9083596467971802, - "D": 0.006120479200035334 + "A": 0.022641029208898544, + "B": 0.0027040853165090084, + "C": 0.962721049785614, + "D": 0.0008778879418969154 }, "sample": { "messages": [ @@ -14722,10 +14722,10 @@ ] }, "predict": { - "A": 0.046922698616981506, - "B": 0.34671446681022644, - "C": 0.07736245542764664, - "D": 0.5044665336608887 + "A": 0.028114723041653633, + "B": 0.18333102762699127, + "C": 0.1111958771944046, + "D": 0.6398881673812866 }, "sample": { "messages": [ @@ -14767,10 +14767,10 @@ ] }, "predict": { - "A": 0.796421468257904, - "B": 0.10778392851352692, - "C": 0.05769257992506027, - "D": 0.0010566763812676072 + "A": 0.8748622536659241, + "B": 0.006679655518382788, + "C": 0.012479239143431187, + "D": 0.00029348357929848135 }, "sample": { "messages": [ @@ -14812,10 +14812,10 @@ ] }, "predict": { - "A": 0.005714669357985258, - "B": 0.0018552816472947598, - "C": 0.31201040744781494, - "D": 0.6605259776115417 + "A": 0.0023128637112677097, + "B": 0.0004554298357106745, + "C": 0.4407537579536438, + "D": 0.49943944811820984 }, "sample": { "messages": [ @@ -14857,10 +14857,10 @@ ] }, "predict": { - "A": 0.022787120193243027, - "B": 0.9689329862594604, - "C": 8.218318544095382e-05, - "D": 0.00036831951001659036 + "A": 0.01329279039055109, + "B": 0.9318966269493103, + "C": 6.155783194117248e-05, + "D": 0.00027588309603743255 }, "sample": { "messages": [ @@ -14902,10 +14902,10 @@ ] }, "predict": { - "A": 0.9445217847824097, - "B": 0.004956395365297794, - "C": 0.011889775283634663, - "D": 0.0004610166361089796 + "A": 0.8731922507286072, + "B": 0.0004262010334059596, + "C": 0.007554593496024609, + "D": 8.392395102418959e-05 }, "sample": { "messages": [ @@ -14947,10 +14947,10 @@ ] }, "predict": { - "A": 0.0010003099450841546, - "B": 0.9680753350257874, - "C": 0.006522840354591608, - "D": 0.00041699124267324805 + "A": 0.0006533028208650649, + "B": 0.9199188947677612, + "C": 0.007023666985332966, + "D": 0.00024033666704781353 }, "sample": { "messages": [ @@ -14992,10 +14992,10 @@ ] }, "predict": { - "A": 0.058383192867040634, - "B": 0.016727065667510033, - "C": 0.9132668375968933, - "D": 0.0010693254880607128 + "A": 0.03681665658950806, + "B": 0.004397123120725155, + "C": 0.9495141506195068, + "D": 0.0003842163132503629 }, "sample": { "messages": [ @@ -15037,10 +15037,10 @@ ] }, "predict": { - "A": 0.013597667217254639, - "B": 0.010589874349534512, - "C": 0.006423083133995533, - "D": 0.9532700777053833 + "A": 0.013433434069156647, + "B": 0.004941884428262711, + "C": 0.00436119781807065, + "D": 0.9417564272880554 }, "sample": { "messages": [ @@ -15082,10 +15082,10 @@ ] }, "predict": { - "A": 0.0005443885456770658, - "B": 0.9842775464057922, - "C": 0.0007920806529000401, - "D": 0.00048042123671621084 + "A": 0.0007993472390808165, + "B": 0.933125913143158, + "C": 0.002312988042831421, + "D": 0.00048482860438525677 }, "sample": { "messages": [ @@ -15120,17 +15120,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.004399552475661039, - "B": 0.07798393070697784, - "C": 0.5085191130638123, - "D": 0.3960350751876831 + "A": 0.004417793825268745, + "B": 0.014485389925539494, + "C": 0.3296859562397003, + "D": 0.615934431552887 }, "sample": { "messages": [ @@ -15160,7 +15160,7 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -15172,10 +15172,10 @@ ] }, "predict": { - "A": 0.9686350226402283, - "B": 0.010760564357042313, - "C": 0.0007794926059432328, - "D": 0.0008832808234728873 + "A": 0.9218906164169312, + "B": 0.0015705511905252934, + "C": 0.00019967314437963068, + "D": 0.0001762109313858673 }, "sample": { "messages": [ @@ -15217,10 +15217,10 @@ ] }, "predict": { - "A": 0.9648588299751282, - "B": 0.0027100900188088417, - "C": 0.0027100900188088417, - "D": 0.0012801557313650846 + "A": 0.9086620211601257, + "B": 0.0009389183251187205, + "C": 0.004207940306514502, + "D": 0.0008285925141535699 }, "sample": { "messages": [ @@ -15262,10 +15262,10 @@ ] }, "predict": { - "A": 0.006541573442518711, - "B": 0.007412573788315058, - "C": 0.0021237381733953953, - "D": 0.9708555936813354 + "A": 0.0026780464686453342, + "B": 0.0006771160988137126, + "C": 0.00046537467278540134, + "D": 0.9534505009651184 }, "sample": { "messages": [ @@ -15307,10 +15307,10 @@ ] }, "predict": { - "A": 0.1434839963912964, - "B": 0.0003138699976261705, - "C": 0.0008531871135346591, - "D": 0.8256934881210327 + "A": 0.15394838154315948, + "B": 7.058888877509162e-05, + "C": 0.00035847994149662554, + "D": 0.781814455986023 }, "sample": { "messages": [ @@ -15352,10 +15352,10 @@ ] }, "predict": { - "A": 0.09022407978773117, - "B": 0.8560222387313843, - "C": 0.015678593888878822, - "D": 0.010775730013847351 + "A": 0.08491140604019165, + "B": 0.8056170344352722, + "C": 0.02432752586901188, + "D": 0.010141221806406975 }, "sample": { "messages": [ @@ -15397,10 +15397,10 @@ ] }, "predict": { - "A": 0.007438416592776775, - "B": 0.0007840034086257219, - "C": 0.0012926030904054642, - "D": 0.9742403626441956 + "A": 0.0026629813946783543, + "B": 0.00019290570344310254, + "C": 0.0007167316507548094, + "D": 0.9480869770050049 }, "sample": { "messages": [ @@ -15442,10 +15442,10 @@ ] }, "predict": { - "A": 0.05600859597325325, - "B": 0.13435763120651245, - "C": 0.7731748223304749, - "D": 0.0019165087724104524 + "A": 0.12131290882825851, + "B": 0.09447859227657318, + "C": 0.6981076598167419, + "D": 0.0028530063573271036 }, "sample": { "messages": [ @@ -15480,17 +15480,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.34283027052879333, - "B": 0.052574723958969116, - "C": 0.5652315616607666, - "D": 0.0009629398118704557 + "A": 0.6678707003593445, + "B": 0.006547573953866959, + "C": 0.19134816527366638, + "D": 0.00021047142217867076 }, "sample": { "messages": [ @@ -15520,7 +15520,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -15532,10 +15532,10 @@ ] }, "predict": { - "A": 0.9583373069763184, - "B": 0.0044379825703799725, - "C": 0.0009902475867420435, - "D": 0.0020963542629033327 + "A": 0.9224777817726135, + "B": 0.0006551200640387833, + "C": 0.00030945680919103324, + "D": 0.0004502570373006165 }, "sample": { "messages": [ @@ -15577,10 +15577,10 @@ ] }, "predict": { - "A": 0.026729516685009003, - "B": 0.6893633604049683, - "C": 0.25360262393951416, - "D": 0.00675828056409955 + "A": 0.03027648665010929, + "B": 0.7808408737182617, + "C": 0.11974583566188812, + "D": 0.005961793474853039 }, "sample": { "messages": [ @@ -15622,10 +15622,10 @@ ] }, "predict": { - "A": 0.03514638915657997, - "B": 0.906437337398529, - "C": 0.024155735969543457, - "D": 0.014651195146143436 + "A": 0.11200755089521408, + "B": 0.7303809523582458, + "C": 0.059953320771455765, + "D": 0.0412052758038044 }, "sample": { "messages": [ @@ -15667,10 +15667,10 @@ ] }, "predict": { - "A": 0.02533935382962227, - "B": 0.9508522748947144, - "C": 0.0034293087664991617, - "D": 0.004989614710211754 + "A": 0.009701898321509361, + "B": 0.8733370304107666, + "C": 0.0040443530306220055, + "D": 0.005884498357772827 }, "sample": { "messages": [ @@ -15712,10 +15712,10 @@ ] }, "predict": { - "A": 1.1355356036801822e-05, - "B": 0.989237368106842, - "C": 3.2533641842746874e-06, - "D": 7.404623465845361e-05 + "A": 1.3661217963090166e-05, + "B": 0.9268630146980286, + "C": 7.78392768552294e-06, + "D": 0.00011438398360041901 }, "sample": { "messages": [ @@ -15757,10 +15757,10 @@ ] }, "predict": { - "A": 0.9684458374977112, - "B": 0.017737705260515213, - "C": 0.00028670328902080655, - "D": 0.0002530147321522236 + "A": 0.8970885872840881, + "B": 0.0068493555299937725, + "C": 0.0002343720552744344, + "D": 8.62206652527675e-05 }, "sample": { "messages": [ @@ -15802,10 +15802,10 @@ ] }, "predict": { - "A": 0.004465719219297171, - "B": 0.002390327164903283, - "C": 0.0030692408327013254, - "D": 0.9643267393112183 + "A": 0.00160309299826622, + "B": 0.0010350345401093364, + "C": 0.0012484899489209056, + "D": 0.9409921169281006 }, "sample": { "messages": [ @@ -15847,10 +15847,10 @@ ] }, "predict": { - "A": 0.0018923579482361674, - "B": 0.00660497834905982, - "C": 0.9802656769752502, - "D": 0.00022600992815569043 + "A": 0.001300897914916277, + "B": 0.0011480383109301329, + "C": 0.9804922342300415, + "D": 5.71574637433514e-05 }, "sample": { "messages": [ @@ -15892,10 +15892,10 @@ ] }, "predict": { - "A": 0.4791972041130066, - "B": 0.0046978844329714775, - "C": 0.011269638314843178, - "D": 0.4791972041130066 + "A": 0.8192043304443359, + "B": 0.0029545121360570192, + "C": 0.006254701875150204, + "D": 0.09784000366926193 }, "sample": { "messages": [ @@ -15937,10 +15937,10 @@ ] }, "predict": { - "A": 0.02903277799487114, - "B": 0.07891926914453506, - "C": 0.8484621644020081, - "D": 0.022610750049352646 + "A": 0.007415410131216049, + "B": 0.00449767429381609, + "C": 0.9712271094322205, + "D": 0.0011371900327503681 }, "sample": { "messages": [ @@ -15982,10 +15982,10 @@ ] }, "predict": { - "A": 0.0587158277630806, - "B": 0.6312546730041504, - "C": 0.12430140376091003, - "D": 0.1596061736345291 + "A": 0.03398439660668373, + "B": 0.36536672711372375, + "C": 0.36536672711372375, + "D": 0.17258702218532562 }, "sample": { "messages": [ @@ -16015,7 +16015,7 @@ "prompt_len": 79, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -16027,10 +16027,10 @@ ] }, "predict": { - "A": 0.003953514155000448, - "B": 0.9673929810523987, - "C": 0.0012835180386900902, - "D": 0.00034545420203357935 + "A": 0.003350759157910943, + "B": 0.9290726184844971, + "C": 0.0008472045883536339, + "D": 0.00031166913686320186 }, "sample": { "messages": [ @@ -16072,10 +16072,10 @@ ] }, "predict": { - "A": 0.0014354254817590117, - "B": 0.01201867125928402, - "C": 0.00024943953030742705, - "D": 0.954761266708374 + "A": 0.0005656913272105157, + "B": 0.0006410122732631862, + "C": 9.234657045453787e-05, + "D": 0.9026125073432922 }, "sample": { "messages": [ @@ -16117,10 +16117,10 @@ ] }, "predict": { - "A": 0.9563735723495483, - "B": 0.003908480517566204, - "C": 0.0005289552500471473, - "D": 0.015458338893949986 + "A": 0.9199467301368713, + "B": 0.0005088081816211343, + "C": 0.00019925212836824358, + "D": 0.0033178459852933884 }, "sample": { "messages": [ @@ -16162,10 +16162,10 @@ ] }, "predict": { - "A": 0.9639147520065308, - "B": 0.0009960108436644077, - "C": 0.0008789764833636582, - "D": 0.0003233573806937784 + "A": 0.8344948291778564, + "B": 0.0001924009993672371, + "C": 0.0005926368176005781, + "D": 7.5345320510678e-05 }, "sample": { "messages": [ @@ -16207,10 +16207,10 @@ ] }, "predict": { - "A": 0.021968277171254158, - "B": 0.934114933013916, - "C": 0.021968277171254158, - "D": 0.005554450210183859 + "A": 0.14854547381401062, + "B": 0.754376232624054, + "C": 0.029250340536236763, + "D": 0.003718763589859009 }, "sample": { "messages": [ @@ -16245,17 +16245,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.0890006572008133, - "B": 0.0890006572008133, - "C": 0.5803571939468384, - "D": 0.21350149810314178 + "A": 0.459576815366745, + "B": 0.017819726839661598, + "C": 0.459576815366745, + "D": 0.013878017663955688 }, "sample": { "messages": [ @@ -16297,10 +16297,10 @@ ] }, "predict": { - "A": 0.00497639924287796, - "B": 0.00497639924287796, - "C": 0.00040848771459423006, - "D": 0.9483337998390198 + "A": 0.004322811495512724, + "B": 0.0002941710117738694, + "C": 0.00018993107369169593, + "D": 0.9334673285484314 }, "sample": { "messages": [ @@ -16342,10 +16342,10 @@ ] }, "predict": { - "A": 8.407505083596334e-05, - "B": 0.9912379384040833, - "C": 3.971423939219676e-05, - "D": 0.0011606202460825443 + "A": 8.590110519435257e-05, + "B": 0.9514064788818359, + "C": 5.2101651817793027e-05, + "D": 0.001836646581068635 }, "sample": { "messages": [ @@ -16387,10 +16387,10 @@ ] }, "predict": { - "A": 0.19421353936195374, - "B": 0.014068781398236752, - "C": 0.0045674643479287624, - "D": 0.7681294679641724 + "A": 0.12760914862155914, + "B": 0.0020626098848879337, + "C": 0.0015090374508872628, + "D": 0.8321161866188049 }, "sample": { "messages": [ @@ -16432,10 +16432,10 @@ ] }, "predict": { - "A": 0.005174861755222082, - "B": 0.9861540794372559, - "C": 9.478089486947283e-05, - "D": 0.000200651164050214 + "A": 0.009308341890573502, + "B": 0.9494766592979431, + "C": 0.00011717472079908475, + "D": 0.00019318846170790493 }, "sample": { "messages": [ @@ -16477,10 +16477,10 @@ ] }, "predict": { - "A": 0.9477834701538086, - "B": 0.010528923943638802, - "C": 0.0016146628186106682, - "D": 0.0007627126760780811 + "A": 0.9196414351463318, + "B": 0.0022795633412897587, + "C": 0.00083860446466133, + "D": 0.0009502633474767208 }, "sample": { "messages": [ @@ -16522,10 +16522,10 @@ ] }, "predict": { - "A": 0.0039958832785487175, - "B": 0.977760374546051, - "C": 0.00013673161447513849, - "D": 0.00017556684906594455 + "A": 0.004270701203495264, + "B": 0.9222146272659302, + "C": 0.00017627286433707923, + "D": 0.00032932107569649816 }, "sample": { "messages": [ @@ -16567,10 +16567,10 @@ ] }, "predict": { - "A": 0.00834367424249649, - "B": 0.9643976092338562, - "C": 0.0005333932349458337, - "D": 0.0008794167661108077 + "A": 0.008695660158991814, + "B": 0.8869814276695251, + "C": 0.00046085307258181274, + "D": 0.0006299114902503788 }, "sample": { "messages": [ @@ -16612,10 +16612,10 @@ ] }, "predict": { - "A": 0.0006097787409089506, - "B": 0.0012909016804769635, - "C": 0.00018597202142700553, - "D": 0.9729580879211426 + "A": 0.00018076378910336643, + "B": 0.00012423701991792768, + "C": 4.865191658609547e-05, + "D": 0.9457099437713623 }, "sample": { "messages": [ @@ -16657,10 +16657,10 @@ ] }, "predict": { - "A": 0.07377681136131287, - "B": 0.8987855315208435, - "C": 0.0008195863338187337, - "D": 0.0005632928223349154 + "A": 0.04390401393175125, + "B": 0.8818357586860657, + "C": 0.0002958229451905936, + "D": 0.0006262571550905704 }, "sample": { "messages": [ @@ -16702,10 +16702,10 @@ ] }, "predict": { - "A": 0.0010099727660417557, - "B": 0.0010099727660417557, - "C": 0.00047707741032354534, - "D": 0.9774267673492432 + "A": 0.00035403689253143966, + "B": 8.951454219641164e-05, + "C": 0.00013024290092289448, + "D": 0.9313599467277527 }, "sample": { "messages": [ @@ -16747,10 +16747,10 @@ ] }, "predict": { - "A": 0.010817848145961761, - "B": 0.9737915992736816, - "C": 0.0010062165092676878, - "D": 0.003979663830250502 + "A": 0.024294620379805565, + "B": 0.9116489291191101, + "C": 0.001759896520525217, + "D": 0.007887308485805988 }, "sample": { "messages": [ @@ -16792,10 +16792,10 @@ ] }, "predict": { - "A": 0.001025545410811901, - "B": 0.0006220247596502304, - "C": 1.0702527106332127e-05, - "D": 0.9924976229667664 + "A": 0.0024133946280926466, + "B": 0.00037010604864917696, + "C": 1.117623378377175e-05, + "D": 0.9736328125 }, "sample": { "messages": [ @@ -16837,10 +16837,10 @@ ] }, "predict": { - "A": 0.045138049870729446, - "B": 0.021321706473827362, - "C": 0.0018630698323249817, - "D": 0.9066219925880432 + "A": 0.029406476765871048, + "B": 0.054938528686761856, + "C": 0.0009452695958316326, + "D": 0.859383225440979 }, "sample": { "messages": [ @@ -16882,10 +16882,10 @@ ] }, "predict": { - "A": 0.9734748601913452, - "B": 0.0027342906687408686, - "C": 0.00013613230839837343, - "D": 0.00013613230839837343 + "A": 0.9141581654548645, + "B": 0.0003937668225262314, + "C": 3.662601739051752e-05, + "D": 5.0061833462677896e-05 }, "sample": { "messages": [ @@ -16927,10 +16927,10 @@ ] }, "predict": { - "A": 0.027297867462038994, - "B": 0.7977606654167175, - "C": 0.0032602655701339245, - "D": 0.15708836913108826 + "A": 0.01925131119787693, + "B": 0.7224001288414001, + "C": 0.006249985657632351, + "D": 0.20697110891342163 }, "sample": { "messages": [ @@ -16972,10 +16972,10 @@ ] }, "predict": { - "A": 0.9426575899124146, - "B": 0.011866307817399502, - "C": 0.0007585876737721264, - "D": 0.01344628818333149 + "A": 0.9021087288856506, + "B": 0.0009321467950940132, + "C": 0.00011850917508127168, + "D": 0.001196900149807334 }, "sample": { "messages": [ @@ -17017,10 +17017,10 @@ ] }, "predict": { - "A": 0.9719768166542053, - "B": 0.0016558790812268853, - "C": 0.00017452835163567215, - "D": 0.007421134039759636 + "A": 0.9178689122200012, + "B": 0.0005752529250457883, + "C": 0.00017544222646392882, + "D": 0.005457847844809294 }, "sample": { "messages": [ @@ -17062,10 +17062,10 @@ ] }, "predict": { - "A": 0.006224906537681818, - "B": 0.059060268104076385, - "C": 0.9238580465316772, - "D": 0.0003099198511335999 + "A": 0.0023614238016307354, + "B": 0.028768030926585197, + "C": 0.9526663422584534, + "D": 0.00016069690173026174 }, "sample": { "messages": [ @@ -17100,17 +17100,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "D" + "C" ] }, "predict": { - "A": 0.007141970098018646, - "B": 0.011775117367506027, - "C": 0.3899383544921875, - "D": 0.5673569440841675 + "A": 0.0012490339577198029, + "B": 0.001707226736471057, + "C": 0.7331646084785461, + "D": 0.23802369832992554 }, "sample": { "messages": [ @@ -17140,7 +17140,7 @@ "prompt_len": 79, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -17152,10 +17152,10 @@ ] }, "predict": { - "A": 0.000703337718732655, - "B": 0.0027817548252642155, - "C": 0.9903732538223267, - "D": 0.000703337718732655 + "A": 0.0001780190650606528, + "B": 0.0001780190650606528, + "C": 0.9914172291755676, + "D": 0.00012235059693921357 }, "sample": { "messages": [ @@ -17197,10 +17197,10 @@ ] }, "predict": { - "A": 0.1237424835562706, - "B": 0.024366337805986404, - "C": 0.806902289390564, - "D": 0.011509843170642853 + "A": 0.15512630343437195, + "B": 0.00725535349920392, + "C": 0.7877963781356812, + "D": 0.005650475155562162 }, "sample": { "messages": [ @@ -17242,10 +17242,10 @@ ] }, "predict": { - "A": 0.06628981977701187, - "B": 0.9151026606559753, - "C": 0.0033003755379468203, - "D": 0.0007364133489318192 + "A": 0.2494138479232788, + "B": 0.6779770255088806, + "C": 0.00853447150439024, + "D": 0.0007005520747043192 }, "sample": { "messages": [ @@ -17287,10 +17287,10 @@ ] }, "predict": { - "A": 0.0040421392768621445, - "B": 0.9890788197517395, - "C": 0.00020124626462347806, - "D": 0.0011580921709537506 + "A": 0.0023837066255509853, + "B": 0.9616559147834778, + "C": 0.00013447953097056597, + "D": 0.0006829433841630816 }, "sample": { "messages": [ @@ -17325,17 +17325,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.3984791040420532, - "B": 0.053928278386592865, - "C": 0.015450711362063885, - "D": 0.5116572976112366 + "A": 0.598131537437439, + "B": 0.010291420854628086, + "C": 0.007073183078318834, + "D": 0.36278510093688965 }, "sample": { "messages": [ @@ -17365,7 +17365,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -17377,10 +17377,10 @@ ] }, "predict": { - "A": 0.0031384665053337812, - "B": 0.986076831817627, - "C": 0.0008991857175715268, - "D": 0.00020063544798176736 + "A": 0.003381860675290227, + "B": 0.9376962184906006, + "C": 0.0018101795576512814, + "D": 0.0008032622863538563 }, "sample": { "messages": [ @@ -17422,10 +17422,10 @@ ] }, "predict": { - "A": 0.005125155206769705, - "B": 0.005125155206769705, - "C": 0.9766817092895508, - "D": 0.0016638945089653134 + "A": 0.0016823195619508624, + "B": 0.0006588057149201632, + "C": 0.9874969720840454, + "D": 0.0004527901182882488 }, "sample": { "messages": [ @@ -17467,10 +17467,10 @@ ] }, "predict": { - "A": 0.9732704162597656, - "B": 0.0014632528182119131, - "C": 0.0002542752190493047, - "D": 0.0008875077473931015 + "A": 0.93910151720047, + "B": 0.0002780160866677761, + "C": 6.60346049699001e-05, + "D": 0.00024534828844480217 }, "sample": { "messages": [ @@ -17512,10 +17512,10 @@ ] }, "predict": { - "A": 0.028806550428271294, - "B": 0.7429307699203491, - "C": 0.04749397188425064, - "D": 0.14629173278808594 + "A": 0.061068713665008545, + "B": 0.4512401819229126, + "C": 0.27369099855422974, + "D": 0.12928247451782227 }, "sample": { "messages": [ @@ -17550,17 +17550,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "C" ] }, "predict": { - "A": 0.5787137746810913, - "B": 0.060995981097221375, - "C": 0.24124379456043243, - "D": 0.07832039147615433 + "A": 0.38395488262176514, + "B": 0.008482667617499828, + "C": 0.4930078387260437, + "D": 0.04585687071084976 }, "sample": { "messages": [ @@ -17590,7 +17590,7 @@ "prompt_len": 73, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -17602,10 +17602,10 @@ ] }, "predict": { - "A": 0.22834669053554535, - "B": 0.7033573389053345, - "C": 0.027272121980786324, - "D": 0.010032853111624718 + "A": 0.3713730573654175, + "B": 0.4768524467945099, + "C": 0.044354185461997986, + "D": 0.0077076018787920475 }, "sample": { "messages": [ @@ -17647,10 +17647,10 @@ ] }, "predict": { - "A": 0.039995498955249786, - "B": 0.03529590368270874, - "C": 0.9102933406829834, - "D": 0.0009406039607711136 + "A": 0.03256161883473396, + "B": 0.0020815948955714703, + "C": 0.9515900611877441, + "D": 0.0003192228905390948 }, "sample": { "messages": [ @@ -17692,10 +17692,10 @@ ] }, "predict": { - "A": 0.012093211524188519, - "B": 0.96068274974823, - "C": 0.010672221891582012, - "D": 0.002101485151797533 + "A": 0.010244512930512428, + "B": 0.9221815466880798, + "C": 0.003325906116515398, + "D": 0.0005779557977803051 }, "sample": { "messages": [ @@ -17737,10 +17737,10 @@ ] }, "predict": { - "A": 0.0002914509386755526, - "B": 0.9844827055931091, - "C": 3.4808850614354014e-05, - "D": 5.064657671027817e-05 + "A": 0.0003469082876108587, + "B": 0.9126068353652954, + "C": 5.3200106776785105e-05, + "D": 6.028361895005219e-05 }, "sample": { "messages": [ @@ -17782,10 +17782,10 @@ ] }, "predict": { - "A": 0.9322513937950134, - "B": 0.011735313571989536, - "C": 0.002039291663095355, - "D": 0.010356378741562366 + "A": 0.882895290851593, + "B": 0.005249887239187956, + "C": 0.001504118088632822, + "D": 0.0036081913858652115 }, "sample": { "messages": [ @@ -17827,10 +17827,10 @@ ] }, "predict": { - "A": 0.007281629368662834, - "B": 0.005004586186259985, - "C": 0.004416532348841429, - "D": 0.9537053108215332 + "A": 0.0015174553263932467, + "B": 0.0004084175161551684, + "C": 0.0013391495449468493, + "D": 0.9481708407402039 }, "sample": { "messages": [ @@ -17872,10 +17872,10 @@ ] }, "predict": { - "A": 0.9558457136154175, - "B": 0.006440437864512205, - "C": 0.0018452162621542811, - "D": 0.001437055878341198 + "A": 0.8888254165649414, + "B": 0.0017158366972580552, + "C": 0.001107827527448535, + "D": 0.0012553329579532146 }, "sample": { "messages": [ @@ -17910,17 +17910,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.0007814565324224532, - "B": 0.0012884041061624885, - "C": 0.5197793245315552, - "D": 0.4587036073207855 + "A": 0.0004677133110817522, + "B": 0.000364255509339273, + "C": 0.13804782927036285, + "D": 0.7944104671478271 }, "sample": { "messages": [ @@ -17950,7 +17950,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -17962,10 +17962,10 @@ ] }, "predict": { - "A": 0.0027616589795798063, - "B": 0.003129369579255581, - "C": 0.9832186102867126, - "D": 0.0010159574449062347 + "A": 0.0010113781318068504, + "B": 0.00028976472094655037, + "C": 0.9787868857383728, + "D": 0.00018708614516071975 }, "sample": { "messages": [ @@ -18007,10 +18007,10 @@ ] }, "predict": { - "A": 0.020067105069756508, - "B": 0.002115057548508048, - "C": 0.9668872952461243, - "D": 0.0009990823455154896 + "A": 0.0014833604218438268, + "B": 0.0003309824678581208, + "C": 0.9866448044776917, + "D": 0.0003750522737391293 }, "sample": { "messages": [ @@ -18052,10 +18052,10 @@ ] }, "predict": { - "A": 0.9603846073150635, - "B": 0.002697522984817624, - "C": 0.0014438797952607274, - "D": 0.00013430175022222102 + "A": 0.8926074504852295, + "B": 0.00015056610573083162, + "C": 0.0002482415293343365, + "D": 1.490806243964471e-05 }, "sample": { "messages": [ @@ -18097,10 +18097,10 @@ ] }, "predict": { - "A": 0.49991917610168457, - "B": 0.4411771297454834, - "C": 0.004901035688817501, - "D": 0.0014041701797395945 + "A": 0.8531925678253174, + "B": 0.012170137837529182, + "C": 0.0012827231548726559, + "D": 0.0002688733802642673 }, "sample": { "messages": [ @@ -18142,10 +18142,10 @@ ] }, "predict": { - "A": 0.43566569685935974, - "B": 0.007979495450854301, - "C": 0.4936739206314087, - "D": 0.01914181560277939 + "A": 0.1746917963027954, + "B": 0.0018230723217129707, + "C": 0.7829142808914185, + "D": 0.008697392418980598 }, "sample": { "messages": [ @@ -18187,10 +18187,10 @@ ] }, "predict": { - "A": 0.0013020688202232122, - "B": 0.9813747406005859, - "C": 0.00017621584993321449, - "D": 0.0027564798947423697 + "A": 0.001395474886521697, + "B": 0.9281884431838989, + "C": 0.0001774147094693035, + "D": 0.0017918252851814032 }, "sample": { "messages": [ @@ -18232,10 +18232,10 @@ ] }, "predict": { - "A": 0.0031127817928791046, - "B": 0.00044843871728517115, - "C": 0.0006945554632693529, - "D": 0.9780069589614868 + "A": 0.0011845872504636645, + "B": 7.113999163266271e-05, + "C": 0.0002058503741864115, + "D": 0.950410783290863 }, "sample": { "messages": [ @@ -18277,10 +18277,10 @@ ] }, "predict": { - "A": 0.022291280329227448, - "B": 0.0014250337844714522, - "C": 0.000434610788943246, - "D": 0.9478493332862854 + "A": 0.006367604248225689, + "B": 0.00018063501920551062, + "C": 0.00014067870506551117, + "D": 0.9450362920761108 }, "sample": { "messages": [ @@ -18322,10 +18322,10 @@ ] }, "predict": { - "A": 0.0008943831780925393, - "B": 0.9808102250099182, - "C": 0.00025624505360610783, - "D": 0.0013013198040425777 + "A": 0.0020078527741134167, + "B": 0.9178792834281921, + "C": 0.00032777292653918266, + "D": 0.0010747261112555861 }, "sample": { "messages": [ @@ -18360,17 +18360,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.19169560074806213, - "B": 0.7581707835197449, - "C": 0.007432845421135426, - "D": 0.0065594627521932125 + "A": 0.5531198382377625, + "B": 0.3354841470718384, + "C": 0.0042231217958033085, + "D": 0.004495490342378616 }, "sample": { "messages": [ @@ -18400,7 +18400,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -18412,10 +18412,10 @@ ] }, "predict": { - "A": 0.9488899111747742, - "B": 0.007244863547384739, - "C": 0.0011110358173027635, - "D": 0.003422231413424015 + "A": 0.8776663541793823, + "B": 0.001495209988206625, + "C": 0.00042838481022045016, + "D": 0.0008519448456354439 }, "sample": { "messages": [ @@ -18457,10 +18457,10 @@ ] }, "predict": { - "A": 0.8296529650688171, - "B": 0.036452408879995346, - "C": 0.0018148585222661495, - "D": 0.11228132247924805 + "A": 0.6964738368988037, + "B": 0.009934665635228157, + "C": 0.0017263861373066902, + "D": 0.25621843338012695 }, "sample": { "messages": [ @@ -18502,10 +18502,10 @@ ] }, "predict": { - "A": 0.9756787419319153, - "B": 0.000539632688742131, - "C": 0.00013644051796291023, - "D": 4.429574983078055e-05 + "A": 0.8804181218147278, + "B": 7.46755104046315e-05, + "C": 3.112938429694623e-05, + "D": 2.424359081487637e-05 }, "sample": { "messages": [ @@ -18547,10 +18547,10 @@ ] }, "predict": { - "A": 0.9081377387046814, - "B": 0.06578528136014938, - "C": 0.0025507721584290266, - "D": 0.0022510485723614693 + "A": 0.9453055262565613, + "B": 0.005620991811156273, + "C": 0.0008620070293545723, + "D": 0.0004911568248644471 }, "sample": { "messages": [ @@ -18592,10 +18592,10 @@ ] }, "predict": { - "A": 0.959997296333313, - "B": 0.0009919629665091634, - "C": 0.0006016559200361371, - "D": 0.0001184730717795901 + "A": 0.8931159973144531, + "B": 3.578295581974089e-05, + "C": 2.1703461243305355e-05, + "D": 5.487494490807876e-06 }, "sample": { "messages": [ @@ -18637,10 +18637,10 @@ ] }, "predict": { - "A": 0.10225667804479599, - "B": 0.015681570395827293, - "C": 0.8561846613883972, - "D": 0.0021222697105258703 + "A": 0.07379207015037537, + "B": 0.0022283275611698627, + "C": 0.898971438407898, + "D": 0.0011204733746126294 }, "sample": { "messages": [ @@ -18682,10 +18682,10 @@ ] }, "predict": { - "A": 0.015552004799246788, - "B": 0.005721262656152248, - "C": 0.00038933681207709014, - "D": 0.9621684551239014 + "A": 0.0026647334452718496, + "B": 0.00036063246079720557, + "C": 5.530477574211545e-05, + "D": 0.9487107992172241 }, "sample": { "messages": [ @@ -18727,10 +18727,10 @@ ] }, "predict": { - "A": 0.0006218900089152157, - "B": 0.0011618435382843018, - "C": 0.9922826886177063, - "D": 0.0004274183593224734 + "A": 0.00033251135027967393, + "B": 0.0002146854531019926, + "C": 0.9912023544311523, + "D": 0.00013861135812476277 }, "sample": { "messages": [ @@ -18772,10 +18772,10 @@ ] }, "predict": { - "A": 0.009611577726900578, - "B": 0.9804075360298157, - "C": 0.0004223032156005502, - "D": 0.00022604262630920857 + "A": 0.013455142267048359, + "B": 0.9432783126831055, + "C": 0.0012515231501311064, + "D": 0.0002623335167299956 }, "sample": { "messages": [ @@ -18817,10 +18817,10 @@ ] }, "predict": { - "A": 0.5333596467971802, - "B": 0.41538092494010925, - "C": 0.011069525964558125, - "D": 0.00976882316172123 + "A": 0.7334787249565125, + "B": 0.16366124153137207, + "C": 0.0036157474387437105, + "D": 0.002645337488502264 }, "sample": { "messages": [ @@ -18862,10 +18862,10 @@ ] }, "predict": { - "A": 0.004949467722326517, - "B": 0.9432016611099243, - "C": 0.017275340855121613, - "D": 0.00021746443235315382 + "A": 0.004070146009325981, + "B": 0.7756325602531433, + "C": 0.10497044771909714, + "D": 0.0002444312849547714 }, "sample": { "messages": [ @@ -18907,10 +18907,10 @@ ] }, "predict": { - "A": 0.002393868286162615, - "B": 0.9657553434371948, - "C": 0.0005341441719792783, - "D": 0.000880654901266098 + "A": 0.0020316867157816887, + "B": 0.8725032210350037, + "C": 0.0005468210438266397, + "D": 0.0012322802795097232 }, "sample": { "messages": [ @@ -18952,10 +18952,10 @@ ] }, "predict": { - "A": 0.09755871444940567, - "B": 0.0460834763944149, - "C": 0.8168491125106812, - "D": 0.01921045035123825 + "A": 0.09873582422733307, + "B": 0.01715771295130253, + "C": 0.8267049193382263, + "D": 0.019442236050963402 }, "sample": { "messages": [ @@ -18997,10 +18997,10 @@ ] }, "predict": { - "A": 0.004549843724817038, - "B": 0.9824925661087036, - "C": 0.0003508462104946375, - "D": 0.0009536988800391555 + "A": 0.0034278545062988997, + "B": 0.9504490494728088, + "C": 0.00029952259501442313, + "D": 0.0011128613259643316 }, "sample": { "messages": [ @@ -19042,10 +19042,10 @@ ] }, "predict": { - "A": 0.8866171836853027, - "B": 0.020851237699389458, - "C": 0.011160863563418388, - "D": 0.011160863563418388 + "A": 0.8266710638999939, + "B": 0.0038282466121017933, + "C": 0.0038282466121017933, + "D": 0.0007081543444655836 }, "sample": { "messages": [ @@ -19087,10 +19087,10 @@ ] }, "predict": { - "A": 0.12800675630569458, - "B": 0.3942885398864746, - "C": 0.005624224431812763, - "D": 0.4467874765396118 + "A": 0.16080768406391144, + "B": 0.09753479063510895, + "C": 0.0022937986068427563, + "D": 0.7206900715827942 }, "sample": { "messages": [ @@ -19132,10 +19132,10 @@ ] }, "predict": { - "A": 0.11607036739587784, - "B": 0.0016556548653170466, - "C": 0.8576504588127136, - "D": 0.00019773977692238986 + "A": 0.13724128901958466, + "B": 0.0002196413406636566, + "C": 0.789769172668457, + "D": 4.0629565773997456e-05 }, "sample": { "messages": [ @@ -19177,10 +19177,10 @@ ] }, "predict": { - "A": 0.20134060084819794, - "B": 0.7027477025985718, - "C": 0.05090687423944473, - "D": 0.027248485013842583 + "A": 0.2270066738128662, + "B": 0.6170680522918701, + "C": 0.0736982673406601, + "D": 0.023926323279738426 }, "sample": { "messages": [ @@ -19222,10 +19222,10 @@ ] }, "predict": { - "A": 0.9420657753944397, - "B": 0.0365278534591198, - "C": 0.0006690309965051711, - "D": 0.005601727869361639 + "A": 0.9470464587211609, + "B": 0.004385695327073336, + "C": 0.0002633814583532512, + "D": 0.0007159450906328857 }, "sample": { "messages": [ @@ -19267,10 +19267,10 @@ ] }, "predict": { - "A": 0.9457727670669556, - "B": 0.009272030554711819, - "C": 0.0038651570212095976, - "D": 0.0026564812287688255 + "A": 0.9305631518363953, + "B": 0.0008485637372359633, + "C": 0.0005478739039972425, + "D": 0.00029325581272132695 }, "sample": { "messages": [ @@ -19312,10 +19312,10 @@ ] }, "predict": { - "A": 0.013780665583908558, - "B": 0.9660992622375488, - "C": 0.0005687960074283183, - "D": 0.001131185912527144 + "A": 0.0048967162147164345, + "B": 0.933148980140686, + "C": 0.0006626984686590731, + "D": 0.0024622236378490925 }, "sample": { "messages": [ @@ -19357,10 +19357,10 @@ ] }, "predict": { - "A": 0.001566838240250945, - "B": 0.9790274500846863, - "C": 0.00015513798280153424, - "D": 0.0001286139158764854 + "A": 0.006032378412783146, + "B": 0.953025221824646, + "C": 0.0003197043261025101, + "D": 0.0005271033151075244 }, "sample": { "messages": [ @@ -19402,10 +19402,10 @@ ] }, "predict": { - "A": 0.019872594624757767, - "B": 0.0050245788879692554, - "C": 0.9575152397155762, - "D": 0.007310718297958374 + "A": 0.012272282503545284, + "B": 0.0006923548062331975, + "C": 0.974908173084259, + "D": 0.0010723400628194213 }, "sample": { "messages": [ @@ -19447,10 +19447,10 @@ ] }, "predict": { - "A": 0.8913450837135315, - "B": 0.07316605746746063, - "C": 0.001518513192422688, - "D": 0.0004929890274070203 + "A": 0.8409549593925476, + "B": 0.010586060583591461, + "C": 0.0009846569737419486, + "D": 0.00011047546286135912 }, "sample": { "messages": [ @@ -19492,10 +19492,10 @@ ] }, "predict": { - "A": 0.532461404800415, - "B": 0.09252791106700897, - "C": 0.2515169680118561, - "D": 0.0816556066274643 + "A": 0.7780846357345581, + "B": 0.016148630529642105, + "C": 0.10530229657888412, + "D": 0.014251116663217545 }, "sample": { "messages": [ @@ -19537,10 +19537,10 @@ ] }, "predict": { - "A": 0.8511282205581665, - "B": 0.007363701704889536, - "C": 0.005734856706112623, - "D": 0.10165277123451233 + "A": 0.8857169151306152, + "B": 0.000974235066678375, + "C": 0.0024877965915948153, + "D": 0.02674633264541626 }, "sample": { "messages": [ @@ -19582,10 +19582,10 @@ ] }, "predict": { - "A": 0.06903504580259323, - "B": 0.05376454442739487, - "C": 0.8410190343856812, - "D": 0.009342878125607967 + "A": 0.2696048319339752, + "B": 0.08752787858247757, + "C": 0.5707534551620483, + "D": 0.008141360245645046 }, "sample": { "messages": [ @@ -19627,10 +19627,10 @@ ] }, "predict": { - "A": 0.9549397230148315, - "B": 0.0007684715674258769, - "C": 0.00028270488837733865, - "D": 0.00041133316699415445 + "A": 0.8928769826889038, + "B": 6.27842455287464e-05, + "C": 9.135052823694423e-05, + "D": 9.135052823694423e-05 }, "sample": { "messages": [ @@ -19672,10 +19672,10 @@ ] }, "predict": { - "A": 0.10566920042037964, - "B": 0.0726253092288971, - "C": 0.004361489322036505, - "D": 0.780795693397522 + "A": 0.013552798889577389, + "B": 0.001618651207536459, + "C": 0.0005254990537650883, + "D": 0.9501245617866516 }, "sample": { "messages": [ @@ -19717,10 +19717,10 @@ ] }, "predict": { - "A": 0.0031890065874904394, - "B": 0.002483600750565529, - "C": 0.1972968429327011, - "D": 0.7803241610527039 + "A": 0.003037427319213748, + "B": 0.001266188221052289, + "C": 0.18791897594928741, + "D": 0.7432339191436768 }, "sample": { "messages": [ @@ -19762,10 +19762,10 @@ ] }, "predict": { - "A": 0.07330308854579926, - "B": 0.012738166376948357, - "C": 0.003649546066299081, - "D": 0.8930144309997559 + "A": 0.02790714055299759, + "B": 0.0009549298556521535, + "C": 0.0008970735361799598, + "D": 0.9241575598716736 }, "sample": { "messages": [ @@ -19807,10 +19807,10 @@ ] }, "predict": { - "A": 0.004396415781229734, - "B": 0.028668232262134552, - "C": 0.00020562308782245964, - "D": 0.9493614435195923 + "A": 0.004684350453317165, + "B": 0.06466546654701233, + "C": 0.00021908999769948423, + "D": 0.8926791548728943 }, "sample": { "messages": [ @@ -19852,10 +19852,10 @@ ] }, "predict": { - "A": 0.07925999164581299, - "B": 0.04807361960411072, - "C": 0.008353942073881626, - "D": 0.8521252274513245 + "A": 0.07214481383562088, + "B": 0.023421991616487503, + "C": 0.0038235350511968136, + "D": 0.8789037466049194 }, "sample": { "messages": [ @@ -19897,10 +19897,10 @@ ] }, "predict": { - "A": 0.02902710996568203, - "B": 0.0009932530811056495, - "C": 0.9612458348274231, - "D": 0.00017260150343645364 + "A": 0.09177903085947037, + "B": 0.0005457380320876837, + "C": 0.8707752227783203, + "D": 0.00018860203272197396 }, "sample": { "messages": [ @@ -19942,10 +19942,10 @@ ] }, "predict": { - "A": 0.005154638551175594, - "B": 0.005154638551175594, - "C": 0.9823002219200134, - "D": 0.0008957418613135815 + "A": 0.003134089522063732, + "B": 0.0015759192174300551, + "C": 0.9847016334533691, + "D": 0.00017681322060525417 }, "sample": { "messages": [ @@ -19987,10 +19987,10 @@ ] }, "predict": { - "A": 0.9559175968170166, - "B": 0.00046657887287437916, - "C": 0.0001716447586659342, - "D": 0.00046657887287437916 + "A": 0.851171612739563, + "B": 9.867869812296703e-05, + "C": 8.18075131974183e-05, + "D": 0.00019624600827228278 }, "sample": { "messages": [ @@ -20032,10 +20032,10 @@ ] }, "predict": { - "A": 0.9664939641952515, - "B": 0.0016465383814647794, - "C": 0.0004163100675214082, - "D": 0.0004163100675214082 + "A": 0.8927122354507446, + "B": 0.00011727476521627977, + "C": 0.00010349461808800697, + "D": 0.0001414603611920029 }, "sample": { "messages": [ @@ -20077,10 +20077,10 @@ ] }, "predict": { - "A": 0.044187482446432114, - "B": 0.03441324830055237, - "C": 0.8875293731689453, - "D": 0.01265992596745491 + "A": 0.0319773405790329, + "B": 0.009161661379039288, + "C": 0.9345148205757141, + "D": 0.0038191485218703747 }, "sample": { "messages": [ @@ -20122,10 +20122,10 @@ ] }, "predict": { - "A": 0.06961754709482193, - "B": 0.8481153249740601, - "C": 0.04784739762544632, - "D": 0.010676196776330471 + "A": 0.02976234257221222, + "B": 0.8697831630706787, + "C": 0.0433039516210556, + "D": 0.01094895415008068 }, "sample": { "messages": [ @@ -20167,10 +20167,10 @@ ] }, "predict": { - "A": 0.002753392793238163, - "B": 0.002429860644042492, - "C": 0.0006961666513234377, - "D": 0.9802756905555725 + "A": 0.0014680211897939444, + "B": 0.000785775191616267, + "C": 0.0003951125545427203, + "D": 0.9764420986175537 }, "sample": { "messages": [ @@ -20205,17 +20205,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.23399068415164948, - "B": 0.49535825848579407, - "C": 0.20649604499340057, - "D": 0.02794620208442211 + "A": 0.3786712884902954, + "B": 0.2949095070362091, + "C": 0.22967574000358582, + "D": 0.024207644164562225 }, "sample": { "messages": [ @@ -20245,7 +20245,7 @@ "prompt_len": 109, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -20257,10 +20257,10 @@ ] }, "predict": { - "A": 0.6076154708862305, - "B": 0.17408475279808044, - "C": 0.0235598087310791, - "D": 0.17408475279808044 + "A": 0.833045482635498, + "B": 0.060345616191625595, + "C": 0.00410657050088048, + "D": 0.028505248948931694 }, "sample": { "messages": [ @@ -20302,10 +20302,10 @@ ] }, "predict": { - "A": 0.0006958639132790267, - "B": 0.006602173205465078, - "C": 6.88998625264503e-05, - "D": 0.9798493385314941 + "A": 0.0001536832278361544, + "B": 0.00034633048926480114, + "C": 1.9538631022442132e-05, + "D": 0.9698469638824463 }, "sample": { "messages": [ @@ -20347,10 +20347,10 @@ ] }, "predict": { - "A": 0.8567038774490356, - "B": 0.04265277460217476, - "C": 0.01384732872247696, - "D": 0.0620594248175621 + "A": 0.9323912262916565, + "B": 0.007118894252926111, + "C": 0.00806676410138607, + "D": 0.01035793125629425 }, "sample": { "messages": [ @@ -20392,10 +20392,10 @@ ] }, "predict": { - "A": 0.029189888387918472, - "B": 0.1903419941663742, - "C": 0.40295398235321045, - "D": 0.3556056320667267 + "A": 0.015171067789196968, + "B": 0.268913596868515, + "C": 0.3452919125556946, + "D": 0.3452919125556946 }, "sample": { "messages": [ @@ -20437,10 +20437,10 @@ ] }, "predict": { - "A": 0.000781719689257443, - "B": 0.9714025855064392, - "C": 0.00013584252155851573, - "D": 0.000418424402596429 + "A": 0.0012983402702957392, + "B": 0.8635801672935486, + "C": 0.00016506543033756316, + "D": 0.00044869439443573356 }, "sample": { "messages": [ @@ -20482,10 +20482,10 @@ ] }, "predict": { - "A": 0.0017793044680729508, - "B": 0.92170250415802, - "C": 0.0006149111432023346, - "D": 0.05199889838695526 + "A": 0.0016812392277643085, + "B": 0.5282296538352966, + "C": 0.0005127489566802979, + "D": 0.4113856554031372 }, "sample": { "messages": [ @@ -20527,10 +20527,10 @@ ] }, "predict": { - "A": 0.0031438772566616535, - "B": 0.0006190660642459989, - "C": 0.9877768754959106, - "D": 0.0002424297999823466 + "A": 0.0005482989945448935, + "B": 0.0001223420404130593, + "C": 0.9913478493690491, + "D": 5.428895019576885e-05 }, "sample": { "messages": [ @@ -20572,10 +20572,10 @@ ] }, "predict": { - "A": 0.007353310938924551, - "B": 0.9630936980247498, - "C": 0.0011276667937636375, - "D": 0.013737794011831284 + "A": 0.02190460078418255, + "B": 0.9314072728157043, + "C": 0.0009041107841767371, + "D": 0.019330741837620735 }, "sample": { "messages": [ @@ -20617,10 +20617,10 @@ ] }, "predict": { - "A": 0.017044445499777794, - "B": 0.9305951595306396, - "C": 0.028101539239287376, - "D": 0.002035668585449457 + "A": 0.07037447392940521, + "B": 0.7565969228744507, + "C": 0.07037447392940521, + "D": 0.0008322108769789338 }, "sample": { "messages": [ @@ -20662,10 +20662,10 @@ ] }, "predict": { - "A": 0.15847019851207733, - "B": 0.8047780990600586, - "C": 0.004223087802529335, - "D": 0.0009422982693649828 + "A": 0.15249627828598022, + "B": 0.7744399905204773, + "C": 0.0031649593729525805, + "D": 0.0005166654009371996 }, "sample": { "messages": [ @@ -20707,10 +20707,10 @@ ] }, "predict": { - "A": 0.004707267042249441, - "B": 0.3100109398365021, - "C": 0.6562930941581726, - "D": 0.0014356353785842657 + "A": 0.006092119961977005, + "B": 0.2147546261548996, + "C": 0.7495672702789307, + "D": 0.0009945111814886332 }, "sample": { "messages": [ @@ -20752,10 +20752,10 @@ ] }, "predict": { - "A": 0.001474216696806252, - "B": 0.0007412827108055353, - "C": 0.00013712352665606886, - "D": 0.9805629253387451 + "A": 0.0006791757768951356, + "B": 9.784453868633136e-05, + "C": 4.078767233295366e-05, + "D": 0.9563506841659546 }, "sample": { "messages": [ @@ -20797,10 +20797,10 @@ ] }, "predict": { - "A": 0.06457783281803131, - "B": 0.026920044794678688, - "C": 0.004128322936594486, - "D": 0.891469419002533 + "A": 0.009552771225571632, + "B": 0.0006920003797858953, + "C": 0.0005062783020548522, + "D": 0.9744091033935547 }, "sample": { "messages": [ @@ -20842,10 +20842,10 @@ ] }, "predict": { - "A": 0.35528624057769775, - "B": 0.14810533821582794, - "C": 0.1901710331439972, - "D": 0.27669718861579895 + "A": 0.6712204813957214, + "B": 0.015785593539476395, + "C": 0.1166406199336052, + "D": 0.14976952970027924 }, "sample": { "messages": [ @@ -20887,10 +20887,10 @@ ] }, "predict": { - "A": 0.08845106512308121, - "B": 0.04734444618225098, - "C": 0.006407373119145632, - "D": 0.8392003178596497 + "A": 0.08169806748628616, + "B": 0.0020452712196856737, + "C": 0.0027955544646829367, + "D": 0.878337025642395 }, "sample": { "messages": [ @@ -20932,10 +20932,10 @@ ] }, "predict": { - "A": 0.9508540034294128, - "B": 0.00725985923781991, - "C": 0.00207998463883996, - "D": 0.009321845136582851 + "A": 0.9003502130508423, + "B": 0.0017380848294124007, + "C": 0.0013536217156797647, + "D": 0.0009303297265432775 }, "sample": { "messages": [ @@ -20977,10 +20977,10 @@ ] }, "predict": { - "A": 3.5138924431521446e-05, - "B": 5.112683720653877e-05, - "C": 0.9938181042671204, - "D": 0.00020221054728608578 + "A": 1.4516645023832098e-05, + "B": 1.4516645023832098e-05, + "C": 0.9849005937576294, + "D": 6.111736001912504e-05 }, "sample": { "messages": [ @@ -21022,10 +21022,10 @@ ] }, "predict": { - "A": 0.12731149792671204, - "B": 0.8301751613616943, - "C": 0.0026422657538205385, - "D": 0.015205188654363155 + "A": 0.36844465136528015, + "B": 0.5360837578773499, + "C": 0.002482560696080327, + "D": 0.016188327223062515 }, "sample": { "messages": [ @@ -21067,10 +21067,10 @@ ] }, "predict": { - "A": 0.002459116280078888, - "B": 0.9319713711738586, - "C": 0.0016901242779567838, - "D": 0.040947962552309036 + "A": 0.0008905461872927845, + "B": 0.9174331426620483, + "C": 0.0009479814907535911, + "D": 0.03139283508062363 }, "sample": { "messages": [ @@ -21112,10 +21112,10 @@ ] }, "predict": { - "A": 0.0004277722619008273, - "B": 0.0004847295058425516, - "C": 0.9931042790412903, - "D": 5.1090111810481176e-05 + "A": 0.00016732016229070723, + "B": 7.424787327181548e-05, + "C": 0.9919312596321106, + "D": 1.2120631254219916e-05 }, "sample": { "messages": [ @@ -21157,10 +21157,10 @@ ] }, "predict": { - "A": 0.9143650531768799, - "B": 0.04017439857125282, - "C": 0.006160943303257227, - "D": 0.00423435028642416 + "A": 0.8645030856132507, + "B": 0.013973390683531761, + "C": 0.0010122290113940835, + "D": 0.000893288990482688 }, "sample": { "messages": [ @@ -21202,10 +21202,10 @@ ] }, "predict": { - "A": 0.732759952545166, - "B": 0.006339615676552057, - "C": 0.23789231479167938, - "D": 0.0002169297367800027 + "A": 0.55574631690979, + "B": 0.0015609771944582462, + "C": 0.3819585144519806, + "D": 0.00022488007380161434 }, "sample": { "messages": [ @@ -21247,10 +21247,10 @@ ] }, "predict": { - "A": 0.07421859353780746, - "B": 0.008864147588610649, - "C": 0.9041675329208374, - "D": 0.0025396207347512245 + "A": 0.08356883376836777, + "B": 0.0009882403537631035, + "C": 0.8984496593475342, + "D": 0.0015306167770177126 }, "sample": { "messages": [ @@ -21292,10 +21292,10 @@ ] }, "predict": { - "A": 0.05715872719883919, - "B": 0.8941129446029663, - "C": 0.006826636381447315, - "D": 0.03059486299753189 + "A": 0.1586676687002182, + "B": 0.7110990881919861, + "C": 0.016723448410630226, + "D": 0.09623680263757706 }, "sample": { "messages": [ @@ -21337,10 +21337,10 @@ ] }, "predict": { - "A": 0.017337171360850334, - "B": 0.9465774297714233, - "C": 0.0016126082045957446, - "D": 0.0020706297364085913 + "A": 0.012997548095881939, + "B": 0.9111984968185425, + "C": 0.0008309054537676275, + "D": 0.002559369197115302 }, "sample": { "messages": [ @@ -21382,10 +21382,10 @@ ] }, "predict": { - "A": 0.05794718116521835, - "B": 0.018812695518136024, - "C": 0.9064464569091797, - "D": 0.0053899274207651615 + "A": 0.020027659833431244, + "B": 0.002110899891704321, + "C": 0.9649866819381714, + "D": 0.003943679854273796 }, "sample": { "messages": [ @@ -21427,10 +21427,10 @@ ] }, "predict": { - "A": 0.3228079676628113, - "B": 0.05609561502933502, - "C": 0.5322203636169434, - "D": 0.07202819734811783 + "A": 0.24421162903308868, + "B": 0.05449097976088524, + "C": 0.5858332514762878, + "D": 0.07928390800952911 }, "sample": { "messages": [ @@ -21472,10 +21472,10 @@ ] }, "predict": { - "A": 0.771384060382843, - "B": 0.17211905121803284, - "C": 0.0019120699726045132, - "D": 0.03840494900941849 + "A": 0.812646746635437, + "B": 0.12462342530488968, + "C": 0.0007888307445682585, + "D": 0.011591782793402672 }, "sample": { "messages": [ @@ -21517,10 +21517,10 @@ ] }, "predict": { - "A": 0.14516954123973846, - "B": 0.00341406068764627, - "C": 0.0014231923269107938, - "D": 0.8353930711746216 + "A": 0.05807047709822655, + "B": 0.0004165109130553901, + "C": 0.0003912757965736091, + "D": 0.908375084400177 }, "sample": { "messages": [ @@ -21562,10 +21562,10 @@ ] }, "predict": { - "A": 0.0030556274577975273, - "B": 0.960049569606781, - "C": 0.00733006838709116, - "D": 0.0018533316906541586 + "A": 0.004233956336975098, + "B": 0.9142799377441406, + "C": 0.006160369608551264, + "D": 0.00100565270986408 }, "sample": { "messages": [ @@ -21607,10 +21607,10 @@ ] }, "predict": { - "A": 0.0006221202202141285, - "B": 0.0007049546111375093, - "C": 0.9926499724388123, - "D": 9.540523024043068e-05 + "A": 0.00035457563353702426, + "B": 9.543274063616991e-05, + "C": 0.9929361939430237, + "D": 2.000378844968509e-05 }, "sample": { "messages": [ @@ -21652,10 +21652,10 @@ ] }, "predict": { - "A": 0.25423476099967957, - "B": 0.030364008620381355, - "C": 0.6910816431045532, - "D": 0.004109321627765894 + "A": 0.05892331525683403, + "B": 0.003766841720789671, + "C": 0.9217157363891602, + "D": 0.0012229144340381026 }, "sample": { "messages": [ @@ -21697,10 +21697,10 @@ ] }, "predict": { - "A": 0.6298760771751404, - "B": 0.33714836835861206, - "C": 0.003305286169052124, - "D": 0.0015613066498190165 + "A": 0.608361542224884, + "B": 0.3256324827671051, + "C": 0.0015079776057973504, + "D": 0.0009736234787851572 }, "sample": { "messages": [ @@ -21742,10 +21742,10 @@ ] }, "predict": { - "A": 0.8755488395690918, - "B": 0.029959632083773613, - "C": 0.0013163343537598848, - "D": 0.043591007590293884 + "A": 0.8813492655754089, + "B": 0.0028051414992660284, + "C": 0.0003147281240671873, + "D": 0.007625164929777384 }, "sample": { "messages": [ @@ -21787,10 +21787,10 @@ ] }, "predict": { - "A": 0.0008463517879135907, - "B": 0.0016831717221066356, - "C": 0.9879971742630005, - "D": 0.0009009368368424475 + "A": 0.0002580544096417725, + "B": 0.00031127306283451617, + "C": 0.9877358078956604, + "D": 0.0001887966354843229 }, "sample": { "messages": [ @@ -21832,10 +21832,10 @@ ] }, "predict": { - "A": 0.8395743370056152, - "B": 0.11362402886152267, - "C": 0.00641022901982069, - "D": 0.013570455834269524 + "A": 0.9036057591438293, + "B": 0.01655011810362339, + "C": 0.0010580135276541114, + "D": 0.0028759792912751436 }, "sample": { "messages": [ @@ -21877,10 +21877,10 @@ ] }, "predict": { - "A": 0.045434676110744476, - "B": 0.9125798940658569, - "C": 0.006967633031308651, - "D": 0.000445425946963951 + "A": 0.13197936117649078, + "B": 0.7594888210296631, + "C": 0.003985431510955095, + "D": 0.00034824313479475677 }, "sample": { "messages": [ @@ -21922,10 +21922,10 @@ ] }, "predict": { - "A": 0.8197869062423706, - "B": 0.02475542016327381, - "C": 0.013250621035695076, - "D": 0.0979095846414566 + "A": 0.8690299391746521, + "B": 0.0033363508991897106, + "C": 0.0033363508991897106, + "D": 0.02973656915128231 }, "sample": { "messages": [ @@ -21967,10 +21967,10 @@ ] }, "predict": { - "A": 0.07402350753545761, - "B": 0.05764956399798393, - "C": 0.2280084788799286, - "D": 0.6197912693023682 + "A": 0.12896554172039032, + "B": 0.0537608340382576, + "C": 0.3093722462654114, + "D": 0.45013394951820374 }, "sample": { "messages": [ @@ -22012,10 +22012,10 @@ ] }, "predict": { - "A": 0.01661263406276703, - "B": 0.9070190191268921, - "C": 0.02738960087299347, - "D": 0.021331043913960457 + "A": 0.0022434680722653866, + "B": 0.5489584803581238, + "C": 0.3329601585865021, + "D": 0.0129102673381567 }, "sample": { "messages": [ @@ -22057,10 +22057,10 @@ ] }, "predict": { - "A": 0.04612695053219795, - "B": 0.02468997985124588, - "C": 0.43764033913612366, - "D": 0.43764033913612366 + "A": 0.02010645531117916, + "B": 0.007396751549094915, + "C": 0.7544891238212585, + "D": 0.14856770634651184 }, "sample": { "messages": [ @@ -22102,10 +22102,10 @@ ] }, "predict": { - "A": 0.9605370163917542, - "B": 0.005040435586124659, - "C": 0.002380933379754424, - "D": 0.00028436194406822324 + "A": 0.9021083116531372, + "B": 0.0004989420413039625, + "C": 0.0017414788017049432, + "D": 6.752446643076837e-05 }, "sample": { "messages": [ @@ -22147,10 +22147,10 @@ ] }, "predict": { - "A": 0.1074376031756401, - "B": 0.545613169670105, - "C": 0.15632079541683197, - "D": 0.1074376031756401 + "A": 0.2443484216928482, + "B": 0.5172855854034424, + "C": 0.07932832092046738, + "D": 0.04811505600810051 }, "sample": { "messages": [ @@ -22192,10 +22192,10 @@ ] }, "predict": { - "A": 0.11345077306032181, - "B": 0.8382940888404846, - "C": 0.010552565567195415, - "D": 0.01971478760242462 + "A": 0.24440167844295502, + "B": 0.6643525958061218, + "C": 0.013788197189569473, + "D": 0.0257597453892231 }, "sample": { "messages": [ @@ -22237,10 +22237,10 @@ ] }, "predict": { - "A": 0.006049268413335085, - "B": 0.005015019793063402, - "C": 0.9556936025619507, - "D": 0.019834786653518677 + "A": 0.002752418164163828, + "B": 0.0010778623400256038, + "C": 0.9799286723136902, + "D": 0.0013840027386322618 }, "sample": { "messages": [ @@ -22282,10 +22282,10 @@ ] }, "predict": { - "A": 0.005176869686692953, - "B": 0.0008996051037684083, - "C": 0.9865368008613586, - "D": 6.121888873167336e-05 + "A": 0.001313381246291101, + "B": 7.409590762108564e-05, + "C": 0.9899010062217712, + "D": 1.8734377590590157e-05 }, "sample": { "messages": [ @@ -22327,10 +22327,10 @@ ] }, "predict": { - "A": 0.9447795748710632, - "B": 0.004957748111337423, - "C": 0.007213480770587921, - "D": 0.015270939096808434 + "A": 0.9233699440956116, + "B": 0.0002909889444708824, + "C": 0.0062216175720095634, + "D": 0.002593556186184287 }, "sample": { "messages": [ @@ -22372,10 +22372,10 @@ ] }, "predict": { - "A": 0.025268329307436943, - "B": 0.9481870532035828, - "C": 0.0034196965862065554, - "D": 0.0012580360053107142 + "A": 0.06173305958509445, + "B": 0.8521985411643982, + "C": 0.004471927415579557, + "D": 0.0014518224634230137 }, "sample": { "messages": [ @@ -22417,10 +22417,10 @@ ] }, "predict": { - "A": 0.011221333406865597, - "B": 0.011945047415792942, - "C": 0.000921103113796562, - "D": 0.9489126801490784 + "A": 0.001737586804665625, + "B": 0.0009900471195578575, + "C": 0.00036421799450181425, + "D": 0.9581432342529297 }, "sample": { "messages": [ @@ -22462,10 +22462,10 @@ ] }, "predict": { - "A": 0.0011577793629840016, - "B": 0.0024510191287845373, - "C": 0.9888116121292114, - "D": 0.0004259232373442501 + "A": 0.0004842152411583811, + "B": 0.00035425942041911185, + "C": 0.9920507073402405, + "D": 8.957081445259973e-05 }, "sample": { "messages": [ @@ -22507,10 +22507,10 @@ ] }, "predict": { - "A": 0.01962178945541382, - "B": 0.9454308152198792, - "C": 0.006370262708514929, - "D": 0.0016106547554954886 + "A": 0.025499096140265465, + "B": 0.8444141149520874, + "C": 0.010629604570567608, + "D": 0.0022280861157923937 }, "sample": { "messages": [ @@ -22552,10 +22552,10 @@ ] }, "predict": { - "A": 0.009275800548493862, - "B": 0.0285714752972126, - "C": 0.9461573362350464, - "D": 0.002345289569348097 + "A": 0.004527069162577391, + "B": 0.0035256845876574516, + "C": 0.9775746464729309, + "D": 0.0007390236714854836 }, "sample": { "messages": [ @@ -22597,10 +22597,10 @@ ] }, "predict": { - "A": 0.0004816949076484889, - "B": 0.9868870377540588, - "C": 0.00017720564210321754, - "D": 0.005178708117455244 + "A": 0.00020362557552289218, + "B": 0.9401389360427856, + "C": 0.00023073804914020002, + "D": 0.009216798469424248 }, "sample": { "messages": [ @@ -22635,17 +22635,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.7653980255126953, - "B": 0.009634939953684807, - "C": 0.029677702113986015, - "D": 0.1330062299966812 + "A": 0.04775989428162575, + "B": 0.0023778253234922886, + "C": 0.042147956788539886, + "D": 0.8465643525123596 }, "sample": { "messages": [ @@ -22675,7 +22675,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -22687,10 +22687,10 @@ ] }, "predict": { - "A": 0.5771685838699341, - "B": 0.2726351320743561, - "C": 0.07811127603054047, - "D": 0.03256162628531456 + "A": 0.6457507014274597, + "B": 0.23755842447280884, + "C": 0.036430761218070984, + "D": 0.028372304514050484 }, "sample": { "messages": [ @@ -22732,10 +22732,10 @@ ] }, "predict": { - "A": 0.010674336925148964, - "B": 0.012095607817173004, - "C": 0.00030280763166956604, - "D": 0.9608731269836426 + "A": 0.0034336079843342304, + "B": 0.0012631537392735481, + "C": 8.595869439886883e-05, + "D": 0.9520443081855774 }, "sample": { "messages": [ @@ -22777,10 +22777,10 @@ ] }, "predict": { - "A": 0.0019130674190819263, - "B": 0.001160333980806172, - "C": 0.9909934401512146, - "D": 0.0005834525800310075 + "A": 0.0007987264543771744, + "B": 5.785954635939561e-05, + "C": 0.9925359487533569, + "D": 6.979193858569488e-05 }, "sample": { "messages": [ @@ -22822,10 +22822,10 @@ ] }, "predict": { - "A": 0.5649688839912415, - "B": 0.036117248237133026, - "C": 0.003806730266660452, - "D": 0.30240604281425476 + "A": 0.6418025493621826, + "B": 0.009154820814728737, + "C": 0.002972135553136468, + "D": 0.23610597848892212 }, "sample": { "messages": [ @@ -22867,10 +22867,10 @@ ] }, "predict": { - "A": 0.0016620294190943241, - "B": 0.9755869507789612, - "C": 0.00019850108947139233, - "D": 0.00013642768317367882 + "A": 0.0022756149992346764, + "B": 0.9180485606193542, + "C": 7.314951653825119e-05, + "D": 0.00010643192945281044 }, "sample": { "messages": [ @@ -22912,10 +22912,10 @@ ] }, "predict": { - "A": 0.9512475728988647, - "B": 0.009325703606009483, - "C": 0.0023579071275889874, - "D": 0.0012620966881513596 + "A": 0.9128787517547607, + "B": 0.0008861252572387457, + "C": 0.0004743086756207049, + "D": 0.00023849736317060888 }, "sample": { "messages": [ @@ -22957,10 +22957,10 @@ ] }, "predict": { - "A": 0.9485660791397095, - "B": 0.0014261113246902823, - "C": 0.00013264903100207448, - "D": 0.0007633424247615039 + "A": 0.8929081559181213, + "B": 8.581887959735468e-05, + "C": 2.3097838493413292e-05, + "D": 9.72455381997861e-05 }, "sample": { "messages": [ @@ -23002,10 +23002,10 @@ ] }, "predict": { - "A": 0.5264544486999512, - "B": 0.06287601590156555, - "C": 0.28179076313972473, - "D": 0.09148405492305756 + "A": 0.4344106614589691, + "B": 0.02450777404010296, + "C": 0.14103248715400696, + "D": 0.2985658049583435 }, "sample": { "messages": [ @@ -23047,10 +23047,10 @@ ] }, "predict": { - "A": 0.0021495402324944735, - "B": 0.0058430559001863, - "C": 0.9826509356498718, - "D": 0.00012126875662943348 + "A": 0.0004244629817549139, + "B": 0.0001662220893194899, + "C": 0.9854215383529663, + "D": 2.5490984626230784e-05 }, "sample": { "messages": [ @@ -23092,10 +23092,10 @@ ] }, "predict": { - "A": 0.1796226054430008, - "B": 0.05831492319703102, - "C": 0.7104212045669556, - "D": 0.008942883461713791 + "A": 0.11357659846544266, + "B": 0.006019338499754667, + "C": 0.8392238616943359, + "D": 0.0015219271881505847 }, "sample": { "messages": [ @@ -23137,10 +23137,10 @@ ] }, "predict": { - "A": 0.00037433148827403784, - "B": 0.0006993432762101293, - "C": 0.0007924597593955696, - "D": 0.9847486615180969 + "A": 0.0002644546621013433, + "B": 0.00021924062457401305, + "C": 0.0007188623421825469, + "D": 0.950905442237854 }, "sample": { "messages": [ @@ -23182,10 +23182,10 @@ ] }, "predict": { - "A": 0.0014879419468343258, - "B": 0.001158810337074101, - "C": 0.9896921515464783, - "D": 0.00025856553111225367 + "A": 0.0011594487586989999, + "B": 0.00020148199109826237, + "C": 0.9902374148368835, + "D": 0.0001148009832832031 }, "sample": { "messages": [ @@ -23227,10 +23227,10 @@ ] }, "predict": { - "A": 0.9617791771888733, - "B": 0.005046953912824392, - "C": 0.001856670598499477, - "D": 0.003930571489036083 + "A": 0.8492866158485413, + "B": 0.0006031416123732924, + "C": 0.0019776250701397657, + "D": 0.13024233281612396 }, "sample": { "messages": [ @@ -23272,10 +23272,10 @@ ] }, "predict": { - "A": 0.06915783882141113, - "B": 0.01543120015412569, - "C": 0.017485840246081352, - "D": 0.8425149321556091 + "A": 0.1497560292482376, + "B": 0.007004183251410723, + "C": 0.01392948254942894, + "D": 0.7605239152908325 }, "sample": { "messages": [ @@ -23310,17 +23310,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.01846138760447502, - "B": 0.7849981784820557, - "C": 0.15457528829574585, - "D": 0.004119292367249727 + "A": 0.014176180586218834, + "B": 0.2512788474559784, + "C": 0.6830466389656067, + "D": 0.002971488982439041 }, "sample": { "messages": [ @@ -23350,7 +23350,7 @@ "prompt_len": 92, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -23362,10 +23362,10 @@ ] }, "predict": { - "A": 0.0027454823721200228, - "B": 0.00958267506211996, - "C": 0.9774593710899353, - "D": 0.0027454823721200228 + "A": 0.0009594124858267605, + "B": 0.0014859671937301755, + "C": 0.988378643989563, + "D": 0.00027487630723044276 }, "sample": { "messages": [ @@ -23407,10 +23407,10 @@ ] }, "predict": { - "A": 0.017664043232798576, - "B": 0.0005010903114452958, - "C": 0.003069550497457385, - "D": 0.9644240736961365 + "A": 0.010456016287207603, + "B": 0.0003157443134114146, + "C": 0.004639828577637672, + "D": 0.9412205219268799 }, "sample": { "messages": [ @@ -23448,14 +23448,14 @@ "acc": false, "f1_macro": [ "C", - "B" + "A" ] }, "predict": { - "A": 0.1038663387298584, - "B": 0.7674741744995117, - "C": 0.0916617214679718, - "D": 0.015928417444229126 + "A": 0.32912832498550415, + "B": 0.32912832498550415, + "C": 0.2563253939151764, + "D": 0.0060281953774392605 }, "sample": { "messages": [ @@ -23485,7 +23485,7 @@ "prompt_len": 77, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -23497,10 +23497,10 @@ ] }, "predict": { - "A": 0.02213350497186184, - "B": 0.9411405920982361, - "C": 0.0020587367471307516, - "D": 0.008142461068928242 + "A": 0.00791290681809187, + "B": 0.9146077036857605, + "C": 0.001765608205460012, + "D": 0.00543845584616065 }, "sample": { "messages": [ @@ -23542,10 +23542,10 @@ ] }, "predict": { - "A": 0.0006923112669028342, - "B": 0.9748468399047852, - "C": 0.0004758181457873434, - "D": 0.0014656229177489877 + "A": 0.0004199230170343071, + "B": 0.915816605091095, + "C": 0.00021115054551046342, + "D": 0.0009463111055083573 }, "sample": { "messages": [ @@ -23587,10 +23587,10 @@ ] }, "predict": { - "A": 0.8549434542655945, - "B": 0.1021084412932396, - "C": 0.003959174267947674, - "D": 0.017743784934282303 + "A": 0.9147434234619141, + "B": 0.024377087131142616, + "C": 0.0020009931176900864, + "D": 0.006163492798805237 }, "sample": { "messages": [ @@ -23632,10 +23632,10 @@ ] }, "predict": { - "A": 0.9692007303237915, - "B": 0.0012859165435656905, - "C": 0.006530423182994127, - "D": 0.002120118122547865 + "A": 0.9040149450302124, + "B": 0.00014325141091831028, + "C": 0.010042699985206127, + "D": 0.0013591315364465117 }, "sample": { "messages": [ @@ -23677,10 +23677,10 @@ ] }, "predict": { - "A": 0.0035585982259362936, - "B": 0.001019555376842618, - "C": 0.9867005944252014, - "D": 0.0002577839477453381 + "A": 0.003125292481854558, + "B": 0.00025653958437033, + "C": 0.9819376468658447, + "D": 8.865771815180779e-05 }, "sample": { "messages": [ @@ -23722,10 +23722,10 @@ ] }, "predict": { - "A": 0.04054500535130501, - "B": 0.01690167374908924, - "C": 0.9228000640869141, - "D": 0.0009535271092317998 + "A": 0.06434016674757004, + "B": 0.0076843369752168655, + "C": 0.8881886005401611, + "D": 0.0006307687726803124 }, "sample": { "messages": [ @@ -23767,10 +23767,10 @@ ] }, "predict": { - "A": 0.05194710195064545, - "B": 0.002930654911324382, - "C": 0.000478416244732216, - "D": 0.9207843542098999 + "A": 0.024694105610251427, + "B": 0.000399143056711182, + "C": 7.859592733439058e-05, + "D": 0.9266394972801208 }, "sample": { "messages": [ @@ -23812,10 +23812,10 @@ ] }, "predict": { - "A": 0.0012872553197667003, - "B": 0.9702097177505493, - "C": 6.408866465790197e-05, - "D": 0.00015374072245322168 + "A": 0.0002582507731858641, + "B": 0.8723369836807251, + "C": 7.399007881758735e-05, + "D": 0.00024260414647869766 }, "sample": { "messages": [ @@ -23853,14 +23853,14 @@ "acc": false, "f1_macro": [ "D", - "C" + "A" ] }, "predict": { - "A": 0.18291188776493073, - "B": 0.036017484962940216, - "C": 0.7234305739402771, - "D": 0.02805044688284397 + "A": 0.460158109664917, + "B": 0.009550277143716812, + "C": 0.40608811378479004, + "D": 0.012262798845767975 }, "sample": { "messages": [ @@ -23890,7 +23890,7 @@ "prompt_len": 101, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -23902,10 +23902,10 @@ ] }, "predict": { - "A": 0.0027542398311197758, - "B": 0.0045409738086164, - "C": 0.9805772304534912, - "D": 0.0009518397273495793 + "A": 0.001156659098342061, + "B": 0.001156659098342061, + "C": 0.9878548979759216, + "D": 0.00024244895030278713 }, "sample": { "messages": [ @@ -23947,10 +23947,10 @@ ] }, "predict": { - "A": 0.9253196120262146, - "B": 0.013198974542319775, - "C": 0.013198974542319775, - "D": 0.0192043948918581 + "A": 0.8548521995544434, + "B": 0.003083078423514962, + "C": 0.003958751447498798, + "D": 0.0034935856238007545 }, "sample": { "messages": [ @@ -23992,10 +23992,10 @@ ] }, "predict": { - "A": 0.9202086329460144, - "B": 0.014873786829411983, - "C": 0.004828811623156071, - "D": 0.004828811623156071 + "A": 0.8905551433563232, + "B": 0.0009795568184927106, + "C": 0.0007166591240093112, + "D": 0.0008644558256492019 }, "sample": { "messages": [ @@ -24037,10 +24037,10 @@ ] }, "predict": { - "A": 0.9785625338554382, - "B": 0.0021405969746410847, - "C": 0.0003719799860846251, - "D": 0.0011457789223641157 + "A": 0.9618815183639526, + "B": 0.00016225132276304066, + "C": 0.00011870560410898179, + "D": 0.00020833482267335057 }, "sample": { "messages": [ @@ -24082,10 +24082,10 @@ ] }, "predict": { - "A": 0.07853258401155472, - "B": 0.06116124242544174, - "C": 0.8443048596382141, - "D": 0.001846909406594932 + "A": 0.040994707494974136, + "B": 0.011745180003345013, + "C": 0.9330351948738098, + "D": 0.0005160471773706377 }, "sample": { "messages": [ @@ -24127,10 +24127,10 @@ ] }, "predict": { - "A": 0.010407249443233013, - "B": 0.9368306994438171, - "C": 0.002981727011501789, - "D": 0.002049308968707919 + "A": 0.04057982936501503, + "B": 0.8150676488876343, + "C": 0.005491882562637329, + "D": 0.006624475587159395 }, "sample": { "messages": [ @@ -24172,10 +24172,10 @@ ] }, "predict": { - "A": 0.05122843757271767, - "B": 0.10845060646533966, - "C": 0.006933015305548906, - "D": 0.8013476133346558 + "A": 0.02793203666806221, + "B": 0.013194158673286438, + "C": 0.001900798873975873, + "D": 0.9249820113182068 }, "sample": { "messages": [ @@ -24217,10 +24217,10 @@ ] }, "predict": { - "A": 0.927049458026886, - "B": 0.021802114322781563, - "C": 0.008020549081265926, - "D": 0.0037886393256485462 + "A": 0.900501549243927, + "B": 0.0022321203723549843, + "C": 0.0011947678867727518, + "D": 0.0004129007284063846 }, "sample": { "messages": [ @@ -24262,10 +24262,10 @@ ] }, "predict": { - "A": 0.08278024196624756, - "B": 0.00872497446835041, - "C": 0.001718049170449376, - "D": 0.8899714946746826 + "A": 0.057454805821180344, + "B": 0.0028605065308511257, + "C": 0.0018468816997483373, + "D": 0.8987444043159485 }, "sample": { "messages": [ @@ -24307,10 +24307,10 @@ ] }, "predict": { - "A": 0.941654622554779, - "B": 0.002334128599613905, - "C": 0.00035795022267848253, - "D": 9.050398512044922e-05 + "A": 0.8346993923187256, + "B": 0.00023213682288769633, + "C": 0.00018078832363244146, + "D": 5.869338201591745e-05 }, "sample": { "messages": [ @@ -24352,10 +24352,10 @@ ] }, "predict": { - "A": 0.006500444374978542, - "B": 0.012144429609179497, - "C": 0.9647515416145325, - "D": 0.007365968078374863 + "A": 0.006603919435292482, + "B": 0.0024294464383274317, + "C": 0.9801085591316223, + "D": 0.0011475890642032027 }, "sample": { "messages": [ @@ -24397,10 +24397,10 @@ ] }, "predict": { - "A": 0.9519580602645874, - "B": 0.007268289104104042, - "C": 0.001837711432017386, - "D": 0.000361866841558367 + "A": 0.8910019397735596, + "B": 0.0007632624474354088, + "C": 0.000980048323981464, + "D": 0.00020542928541544825 }, "sample": { "messages": [ @@ -24442,10 +24442,10 @@ ] }, "predict": { - "A": 0.0003308241721242666, - "B": 0.9861729741096497, - "C": 0.000257646112004295, - "D": 0.0010190102038905025 + "A": 0.0002485328586772084, + "B": 0.9512908458709717, + "C": 7.579823432024568e-05, + "D": 0.0003616132016759366 }, "sample": { "messages": [ @@ -24487,10 +24487,10 @@ ] }, "predict": { - "A": 0.0016661181580275297, - "B": 0.00746702216565609, - "C": 0.0006129305111244321, - "D": 0.9779869914054871 + "A": 0.0010625275317579508, + "B": 0.0021130884997546673, + "C": 0.0005019024829380214, + "D": 0.9659872055053711 }, "sample": { "messages": [ @@ -24525,17 +24525,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.46042537689208984, - "B": 0.5217303037643433, - "C": 0.0024160908069461584, - "D": 0.0006108833476901054 + "A": 0.9025592803955078, + "B": 0.02122616022825241, + "C": 0.0009326123399659991, + "D": 0.00018364227435085922 }, "sample": { "messages": [ @@ -24565,22 +24565,22 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.11571164429187775, - "B": 0.6658746004104614, - "C": 0.06193597987294197, - "D": 0.13111847639083862 + "A": 0.14013846218585968, + "B": 0.23104923963546753, + "C": 0.027594897896051407, + "D": 0.5542583465576172 }, "sample": { "messages": [ @@ -24610,7 +24610,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -24622,10 +24622,10 @@ ] }, "predict": { - "A": 0.0010109244612976909, - "B": 0.0006947974907234311, - "C": 0.00018700220971368253, - "D": 0.9783477783203125 + "A": 0.00042698305333033204, + "B": 0.00015707829152233899, + "C": 0.00011492092744447291, + "D": 0.931213915348053 }, "sample": { "messages": [ @@ -24667,10 +24667,10 @@ ] }, "predict": { - "A": 0.6605915427207947, - "B": 0.31204134225845337, - "C": 0.0021025179885327816, - "D": 0.00185546581633389 + "A": 0.9202011823654175, + "B": 0.04043081775307655, + "C": 0.0011469341116026044, + "D": 0.0005089488695375621 }, "sample": { "messages": [ @@ -24712,10 +24712,10 @@ ] }, "predict": { - "A": 0.01744287833571434, - "B": 0.019765369594097137, - "C": 0.9523488879203796, - "D": 0.0007663865690119565 + "A": 0.007428223732858896, + "B": 0.005785106681287289, + "C": 0.9729053378105164, + "D": 0.00019795546540990472 }, "sample": { "messages": [ @@ -24757,10 +24757,10 @@ ] }, "predict": { - "A": 0.0011194972321391106, - "B": 0.9561164379119873, - "C": 0.0008718653116375208, - "D": 0.0050172386690974236 + "A": 0.0014477368677034974, + "B": 0.8498003482818604, + "C": 0.0009950138628482819, + "D": 0.00572590995579958 }, "sample": { "messages": [ @@ -24802,10 +24802,10 @@ ] }, "predict": { - "A": 0.9179850816726685, - "B": 0.040333449840545654, - "C": 0.004251114558428526, - "D": 0.002921745413914323 + "A": 0.9021130204200745, + "B": 0.008844004943966866, + "C": 0.002380331512540579, + "D": 0.001443744171410799 }, "sample": { "messages": [ @@ -24847,10 +24847,10 @@ ] }, "predict": { - "A": 0.9381186366081238, - "B": 0.0009693557512946427, - "C": 0.00027772507746703923, - "D": 0.036374807357788086 + "A": 0.8951584696769714, + "B": 9.749061428010464e-05, + "C": 0.00010377822036389261, + "D": 0.05722556635737419 }, "sample": { "messages": [ @@ -24892,10 +24892,10 @@ ] }, "predict": { - "A": 0.00035838221083395183, - "B": 0.9427910447120667, - "C": 0.0014174289535731077, - "D": 0.04693880304694176 + "A": 0.00030114786932244897, + "B": 0.8433197736740112, + "C": 0.0034464551135897636, + "D": 0.11413092166185379 }, "sample": { "messages": [ @@ -24937,10 +24937,10 @@ ] }, "predict": { - "A": 0.005745773669332266, - "B": 0.008360051549971104, - "C": 0.0005689086392521858, - "D": 0.9662905335426331 + "A": 0.003447318449616432, + "B": 0.0016283979639410973, + "C": 0.000818809843622148, + "D": 0.955845832824707 }, "sample": { "messages": [ @@ -24982,10 +24982,10 @@ ] }, "predict": { - "A": 0.0035457713529467583, - "B": 0.9831441044807434, - "C": 7.833628478692845e-05, - "D": 0.0008965113665908575 + "A": 0.0026421064976602793, + "B": 0.940654993057251, + "C": 3.3259209885727614e-05, + "D": 0.0005202615866437554 }, "sample": { "messages": [ @@ -25027,10 +25027,10 @@ ] }, "predict": { - "A": 0.9531280994415283, - "B": 0.0020849592983722687, - "C": 0.0009848650079220533, - "D": 0.0003623116062954068 + "A": 0.8511767983436584, + "B": 0.00020890409359708428, + "C": 0.00020890409359708428, + "D": 7.68515164963901e-05 }, "sample": { "messages": [ @@ -25072,10 +25072,10 @@ ] }, "predict": { - "A": 0.1010114997625351, - "B": 0.008291528560221195, - "C": 0.001353554893285036, - "D": 0.8457589149475098 + "A": 0.08853661268949509, + "B": 0.0018375188810750842, + "C": 0.0005604116013273597, + "D": 0.8400120139122009 }, "sample": { "messages": [ @@ -25117,10 +25117,10 @@ ] }, "predict": { - "A": 0.05175439640879631, - "B": 0.007004194892942905, - "C": 0.0017709379317238927, - "D": 0.9173685312271118 + "A": 0.02812693454325199, + "B": 0.0009624507511034608, + "C": 0.0007978997309692204, + "D": 0.9314361214637756 }, "sample": { "messages": [ @@ -25162,10 +25162,10 @@ ] }, "predict": { - "A": 0.9616552591323853, - "B": 0.0023837049957364798, - "C": 0.005046303384006023, - "D": 0.0009936760179698467 + "A": 0.9191225171089172, + "B": 0.002925365697592497, + "C": 0.01683431677520275, + "D": 0.0015658354386687279 }, "sample": { "messages": [ @@ -25200,17 +25200,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.24628794193267822, - "B": 0.46012642979621887, - "C": 0.14938117563724518, - "D": 0.07056267559528351 + "A": 0.11373019218444824, + "B": 0.14603246748447418, + "C": 0.6544721126556396, + "D": 0.006416219286620617 }, "sample": { "messages": [ @@ -25240,7 +25240,7 @@ "prompt_len": 98, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -25252,10 +25252,10 @@ ] }, "predict": { - "A": 0.00846884772181511, - "B": 0.9788656830787659, - "C": 0.00012080162559868768, - "D": 0.0002256871375720948 + "A": 0.0029909410513937473, + "B": 0.9397256970405579, + "C": 7.487673428840935e-05, + "D": 0.0002782008668873459 }, "sample": { "messages": [ @@ -25297,10 +25297,10 @@ ] }, "predict": { - "A": 0.0048804981634020805, - "B": 0.9300583004951477, - "C": 0.0029601717833429575, - "D": 0.036062274128198624 + "A": 0.0019444413483142853, + "B": 0.8888910412788391, + "C": 0.0010407844092696905, + "D": 0.050147805362939835 }, "sample": { "messages": [ @@ -25335,17 +25335,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.0077237398363649845, - "B": 0.787842869758606, - "C": 0.07328079640865326, - "D": 0.10662293434143066 + "A": 0.012055897153913975, + "B": 0.31092569231987, + "C": 0.5126297473907471, + "D": 0.11438316851854324 }, "sample": { "messages": [ @@ -25375,7 +25375,7 @@ "prompt_len": 72, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -25387,10 +25387,10 @@ ] }, "predict": { - "A": 0.2962389588356018, - "B": 0.08487387746572495, - "C": 0.02755451202392578, - "D": 0.5534471869468689 + "A": 0.34922829270362854, + "B": 0.0025048425886780024, + "C": 0.002666391199454665, + "D": 0.5757801532745361 }, "sample": { "messages": [ @@ -25432,10 +25432,10 @@ ] }, "predict": { - "A": 0.00042149052023887634, - "B": 0.9785207509994507, - "C": 0.0005412045284174383, - "D": 0.00025564691168256104 + "A": 0.0002720066113397479, + "B": 0.9188023805618286, + "C": 0.0003281025856267661, + "D": 0.0004773864638991654 }, "sample": { "messages": [ @@ -25477,10 +25477,10 @@ ] }, "predict": { - "A": 0.03549995645880699, - "B": 0.381660521030426, - "C": 0.5553127527236938, - "D": 0.011525148525834084 + "A": 0.016290955245494843, + "B": 0.06443198025226593, + "C": 0.8894559741020203, + "D": 0.0014234877889975905 }, "sample": { "messages": [ @@ -25522,10 +25522,10 @@ ] }, "predict": { - "A": 7.33275810489431e-05, - "B": 0.9796364307403564, - "C": 0.00010669100447557867, - "D": 0.0010122560197487473 + "A": 5.677043372998014e-05, + "B": 0.9148502945899963, + "C": 0.00011290149268461391, + "D": 0.0013754217652603984 }, "sample": { "messages": [ @@ -25567,10 +25567,10 @@ ] }, "predict": { - "A": 0.05867277458310127, - "B": 0.011553354561328888, - "C": 0.0005076189409010112, - "D": 0.9177966117858887 + "A": 0.017440807074308395, + "B": 0.0026746345683932304, + "C": 0.00018201115017291158, + "D": 0.9522358179092407 }, "sample": { "messages": [ @@ -25612,10 +25612,10 @@ ] }, "predict": { - "A": 0.000892451498657465, - "B": 0.9786918759346008, - "C": 7.32568878447637e-05, - "D": 0.00022564706159755588 + "A": 0.0010498159099370241, + "B": 0.8966045379638672, + "C": 5.563820741372183e-05, + "D": 0.00015124033961910754 }, "sample": { "messages": [ @@ -25657,10 +25657,10 @@ ] }, "predict": { - "A": 0.00508168013766408, - "B": 0.003492584452033043, - "C": 0.0021183595526963472, - "D": 0.9683968424797058 + "A": 0.0014424127293750644, + "B": 0.0004984845290891826, + "C": 0.0005306340171955526, + "D": 0.9594087600708008 }, "sample": { "messages": [ @@ -25702,10 +25702,10 @@ ] }, "predict": { - "A": 0.00013511687575373799, - "B": 0.9662135243415833, - "C": 4.970672307536006e-05, - "D": 0.02005312219262123 + "A": 0.0001795951829990372, + "B": 0.9395961165428162, + "C": 9.030613728100434e-05, + "D": 0.004930547904223204 }, "sample": { "messages": [ @@ -25747,10 +25747,10 @@ ] }, "predict": { - "A": 0.4573580026626587, - "B": 0.1906551718711853, - "C": 0.04254091903567314, - "D": 0.2774016559123993 + "A": 0.329983115196228, + "B": 0.22679385542869568, + "C": 0.06497751921415329, + "D": 0.29120907187461853 }, "sample": { "messages": [ @@ -25792,10 +25792,10 @@ ] }, "predict": { - "A": 0.000846692593768239, - "B": 0.000846692593768239, - "C": 0.9883949756622314, - "D": 0.0003315695794299245 + "A": 0.0004842505441047251, + "B": 0.00017814579769037664, + "C": 0.9921229481697083, + "D": 0.00010150441812584177 }, "sample": { "messages": [ @@ -25837,10 +25837,10 @@ ] }, "predict": { - "A": 0.006115430500358343, - "B": 0.0015462231822311878, - "C": 0.06574707478284836, - "D": 0.9076103568077087 + "A": 0.0012569129467010498, + "B": 0.00015979855379555374, + "C": 0.002832497004419565, + "D": 0.9473406076431274 }, "sample": { "messages": [ @@ -25875,17 +25875,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "D" ] }, "predict": { - "A": 0.021866777911782265, - "B": 0.6390409469604492, - "C": 0.01032913476228714, - "D": 0.30186158418655396 + "A": 0.02578607387840748, + "B": 0.24465149641036987, + "C": 0.003489765804260969, + "D": 0.6650316119194031 }, "sample": { "messages": [ @@ -25915,7 +25915,7 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -25927,10 +25927,10 @@ ] }, "predict": { - "A": 0.0010238527320325375, - "B": 0.001912808744236827, - "C": 0.9908594489097595, - "D": 0.0001893937005661428 + "A": 0.0013151546008884907, + "B": 0.0004269682103767991, + "C": 0.9912376403808594, + "D": 3.730806565727107e-05 }, "sample": { "messages": [ @@ -25972,10 +25972,10 @@ ] }, "predict": { - "A": 0.0013010225957259536, - "B": 0.000509487756062299, - "C": 4.451854692888446e-05, - "D": 0.9805861711502075 + "A": 0.0006305920542217791, + "B": 0.00010294132516719401, + "C": 2.949318331957329e-05, + "D": 0.945206880569458 }, "sample": { "messages": [ @@ -26017,10 +26017,10 @@ ] }, "predict": { - "A": 0.656292200088501, - "B": 0.05387174338102341, - "C": 0.00344390538521111, - "D": 0.21306687593460083 + "A": 0.8549141883850098, + "B": 0.008921830914914608, + "C": 0.0015503816539421678, + "D": 0.03756231069564819 }, "sample": { "messages": [ @@ -26062,10 +26062,10 @@ ] }, "predict": { - "A": 0.0031066611409187317, - "B": 0.0021351748146116734, - "C": 0.001467482652515173, - "D": 0.9760838150978088 + "A": 0.0005179063882678747, + "B": 0.0011671201791614294, + "C": 0.0006650049472227693, + "D": 0.9363967180252075 }, "sample": { "messages": [ @@ -26107,10 +26107,10 @@ ] }, "predict": { - "A": 0.0005409820005297661, - "B": 0.9781184792518616, - "C": 7.321395969484001e-05, - "D": 0.00017563113942742348 + "A": 0.00031850641244091094, + "B": 0.8919297456741333, + "C": 7.106838165782392e-05, + "D": 8.053103374550119e-05 }, "sample": { "messages": [ @@ -26152,10 +26152,10 @@ ] }, "predict": { - "A": 0.8580715656280518, - "B": 0.005781640764325857, - "C": 0.09044007211923599, - "D": 0.00047458597691729665 + "A": 0.7270268201828003, + "B": 0.0006227957783266902, + "C": 0.0868309736251831, + "D": 0.00012263575627002865 }, "sample": { "messages": [ @@ -26197,10 +26197,10 @@ ] }, "predict": { - "A": 0.007441060151904821, - "B": 0.004513231571763754, - "C": 0.9745866060256958, - "D": 0.0031018955633044243 + "A": 0.0074948701076209545, + "B": 0.0011493755737319589, + "C": 0.9816343188285828, + "D": 0.0008409011643379927 }, "sample": { "messages": [ @@ -26242,10 +26242,10 @@ ] }, "predict": { - "A": 0.06251110136508942, - "B": 0.07083436101675034, - "C": 0.5930888056755066, - "D": 0.2472362071275711 + "A": 0.07048878818750381, + "B": 0.007908623665571213, + "C": 0.7578259706497192, + "D": 0.1316903978586197 }, "sample": { "messages": [ @@ -26287,10 +26287,10 @@ ] }, "predict": { - "A": 0.03211263194680214, - "B": 0.017188653349876404, - "C": 0.9384686350822449, - "D": 0.001245141844265163 + "A": 0.02253483049571514, + "B": 0.0030497577972710133, + "C": 0.9582054018974304, + "D": 0.0003421732981223613 }, "sample": { "messages": [ @@ -26332,10 +26332,10 @@ ] }, "predict": { - "A": 0.9611971974372864, - "B": 0.006476495414972305, - "C": 0.002102610422298312, - "D": 0.008315985091030598 + "A": 0.9416212439537048, + "B": 0.0009140254114754498, + "C": 0.0007577537326142192, + "D": 0.0029969741590321064 }, "sample": { "messages": [ @@ -26377,10 +26377,10 @@ ] }, "predict": { - "A": 0.0899554193019867, - "B": 0.02274429239332676, - "C": 0.8534733057022095, - "D": 0.0012831451604142785 + "A": 0.05695373937487602, + "B": 0.009897076524794102, + "C": 0.8909063935279846, + "D": 0.000864796806126833 }, "sample": { "messages": [ @@ -26422,10 +26422,10 @@ ] }, "predict": { - "A": 0.10606063157320023, - "B": 0.2544262409210205, - "C": 0.5386203527450562, - "D": 0.06432902812957764 + "A": 0.10373881459236145, + "B": 0.1938096135854721, + "C": 0.464925080537796, + "D": 0.17103639245033264 }, "sample": { "messages": [ @@ -26467,10 +26467,10 @@ ] }, "predict": { - "A": 0.02748284675180912, - "B": 0.011456554755568504, - "C": 0.910106897354126, - "D": 0.035288672894239426 + "A": 0.02236286550760269, + "B": 0.0030264847446233034, + "C": 0.950893223285675, + "D": 0.006407068111002445 }, "sample": { "messages": [ @@ -26512,10 +26512,10 @@ ] }, "predict": { - "A": 0.005015392322093248, - "B": 0.007297351956367493, - "C": 0.01983626000583172, - "D": 0.9557645916938782 + "A": 0.010569315403699875, + "B": 0.004690105561167002, + "C": 0.2560711205005646, + "D": 0.6960734128952026 }, "sample": { "messages": [ @@ -26557,10 +26557,10 @@ ] }, "predict": { - "A": 0.9521604776382446, - "B": 0.003891262225806713, - "C": 0.0007662349962629378, - "D": 0.0005967443576082587 + "A": 0.9140061736106873, + "B": 0.00044612211058847606, + "C": 0.00014483463019132614, + "D": 0.00021073313837405294 }, "sample": { "messages": [ @@ -26602,10 +26602,10 @@ ] }, "predict": { - "A": 0.006548715755343437, - "B": 0.004500862210988998, - "C": 0.2168636918067932, - "D": 0.7569286227226257 + "A": 0.001807664753869176, + "B": 0.0005513065843842924, + "C": 0.03204162418842316, + "D": 0.9363934993743896 }, "sample": { "messages": [ @@ -26647,10 +26647,10 @@ ] }, "predict": { - "A": 0.005151357501745224, - "B": 0.005151357501745224, - "C": 0.0013024670770391822, - "D": 0.9816749691963196 + "A": 0.00201009726151824, + "B": 0.0005759025225415826, + "C": 0.0005759025225415826, + "D": 0.9781696796417236 }, "sample": { "messages": [ @@ -26692,10 +26692,10 @@ ] }, "predict": { - "A": 0.8997792601585388, - "B": 0.001532881986349821, - "C": 0.0003420321736484766, - "D": 0.0008204925106838346 + "A": 0.7965685725212097, + "B": 0.0002844531263690442, + "C": 0.00022153231839183718, + "D": 0.0005314284353516996 }, "sample": { "messages": [ @@ -26737,10 +26737,10 @@ ] }, "predict": { - "A": 0.046454090625047684, - "B": 0.9330553412437439, - "C": 0.0020410502329468727, - "D": 0.0029697108548134565 + "A": 0.056096822023391724, + "B": 0.8775019645690918, + "C": 0.005912561435252428, + "D": 0.0024647226091474295 }, "sample": { "messages": [ @@ -26778,14 +26778,14 @@ "acc": false, "f1_macro": [ "D", - "A" + "C" ] }, "predict": { - "A": 0.3994327187538147, - "B": 0.02553487941622734, - "C": 0.12967681884765625, - "D": 0.3994327187538147 + "A": 0.11631440371274948, + "B": 0.017837392166256905, + "C": 0.4600324034690857, + "D": 0.35827359557151794 }, "sample": { "messages": [ @@ -26815,7 +26815,7 @@ "prompt_len": 72, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -26827,10 +26827,10 @@ ] }, "predict": { - "A": 0.02251550927758217, - "B": 0.005023888777941465, - "C": 0.004433566704392433, - "D": 0.9573837518692017 + "A": 0.01918015629053116, + "B": 0.004555686376988888, + "C": 0.007055984809994698, + "D": 0.9241516590118408 }, "sample": { "messages": [ @@ -26872,10 +26872,10 @@ ] }, "predict": { - "A": 0.0011453245533630252, - "B": 0.9781744480133057, - "C": 0.00025555642787367105, - "D": 0.009589685127139091 + "A": 0.0005681996699422598, + "B": 0.9650864005088806, + "C": 9.275604679714888e-05, + "D": 0.006502700969576836 }, "sample": { "messages": [ @@ -26917,10 +26917,10 @@ ] }, "predict": { - "A": 0.969639241695404, - "B": 0.003962694201618433, - "C": 0.0001972909230971709, - "D": 0.0016518967458978295 + "A": 0.9020127058029175, + "B": 0.001356121152639389, + "C": 0.00011131721112178639, + "D": 0.0004988891305401921 }, "sample": { "messages": [ @@ -26962,10 +26962,10 @@ ] }, "predict": { - "A": 0.006598529871553183, - "B": 0.0031169247813522816, - "C": 0.9793086647987366, - "D": 0.00018718586943577975 + "A": 0.006583486218005419, + "B": 0.00039536907570436597, + "C": 0.97707599401474, + "D": 5.350738865672611e-05 }, "sample": { "messages": [ @@ -27000,17 +27000,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.0747549831867218, - "B": 0.6259158253669739, - "C": 0.26092052459716797, - "D": 0.008928208611905575 + "A": 0.44027119874954224, + "B": 0.2670379877090454, + "C": 0.23566021025180817, + "D": 0.006685165222734213 }, "sample": { "messages": [ @@ -27040,7 +27040,7 @@ "prompt_len": 85, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -27052,10 +27052,10 @@ ] }, "predict": { - "A": 0.002539781155064702, - "B": 0.008327621966600418, - "C": 0.00017283426132053137, - "D": 0.9625421762466431 + "A": 0.0007087907870300114, + "B": 0.0005876081413589418, + "C": 5.465607682708651e-05, + "D": 0.9375828504562378 }, "sample": { "messages": [ @@ -27097,10 +27097,10 @@ ] }, "predict": { - "A": 0.016201650723814964, - "B": 0.03886574134230614, - "C": 0.018358875066041946, - "D": 0.884580135345459 + "A": 0.006898744963109493, + "B": 0.010037615895271301, + "C": 0.011374108493328094, + "D": 0.903557300567627 }, "sample": { "messages": [ @@ -27142,10 +27142,10 @@ ] }, "predict": { - "A": 0.1439272165298462, - "B": 0.8282440304756165, - "C": 0.008119819685816765, - "D": 0.003835531184449792 + "A": 0.008128213696181774, + "B": 0.939493715763092, + "C": 0.008128213696181774, + "D": 0.0009707766585052013 }, "sample": { "messages": [ @@ -27187,10 +27187,10 @@ ] }, "predict": { - "A": 0.0577961690723896, - "B": 0.021262023597955704, - "C": 0.9040842652320862, - "D": 0.0011268460657447577 + "A": 0.025424774736166, + "B": 0.003899015486240387, + "C": 0.9540576338768005, + "D": 0.0003406921459827572 }, "sample": { "messages": [ @@ -27232,10 +27232,10 @@ ] }, "predict": { - "A": 0.966993510723114, - "B": 0.0009991921251639724, - "C": 0.0004165252612438053, - "D": 0.0002862733672372997 + "A": 0.8924373984336853, + "B": 0.00017058123194146901, + "C": 0.00029937937506474555, + "D": 0.00019329384667798877 }, "sample": { "messages": [ @@ -27277,10 +27277,10 @@ ] }, "predict": { - "A": 0.004910894203931093, - "B": 0.9358507990837097, - "C": 0.009174758568406105, - "D": 0.022009100764989853 + "A": 0.005143145099282265, + "B": 0.8649440407752991, + "C": 0.017951341345906258, + "D": 0.0035348287783563137 }, "sample": { "messages": [ @@ -27322,10 +27322,10 @@ ] }, "predict": { - "A": 0.04244031384587288, - "B": 0.05449444428086281, - "C": 0.004473176319152117, - "D": 0.8524365425109863 + "A": 0.009056505747139454, + "B": 0.00549304811283946, + "C": 0.00108164525590837, + "D": 0.9237886667251587 }, "sample": { "messages": [ @@ -27367,10 +27367,10 @@ ] }, "predict": { - "A": 0.001887824502773583, - "B": 0.005814908538013697, - "C": 0.0010104796383529902, - "D": 0.9779173135757446 + "A": 0.0012049659853801131, + "B": 0.0012049659853801131, + "C": 0.0007308487547561526, + "D": 0.9667609333992004 }, "sample": { "messages": [ @@ -27405,17 +27405,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "A" + "C" ] }, "predict": { - "A": 0.6291279196739197, - "B": 0.0069889803417027, - "C": 0.2622595429420471, - "D": 0.008974027819931507 + "A": 0.3510152995586395, + "B": 0.00126595888286829, + "C": 0.5107242465019226, + "D": 0.001527037937194109 }, "sample": { "messages": [ @@ -27445,7 +27445,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -27457,10 +27457,10 @@ ] }, "predict": { - "A": 0.009477974846959114, - "B": 0.009477974846959114, - "C": 0.9667796492576599, - "D": 0.0006449842476285994 + "A": 0.0021541453897953033, + "B": 0.0008979814010672271, + "C": 0.9847561717033386, + "D": 0.0001661098503973335 }, "sample": { "messages": [ @@ -27502,10 +27502,10 @@ ] }, "predict": { - "A": 0.0836048573255539, - "B": 0.0032417122274637222, - "C": 0.8988369703292847, - "D": 0.0011203058529645205 + "A": 0.04142490401864052, + "B": 0.0003366815217304975, + "C": 0.9428264498710632, + "D": 0.00027911883080378175 }, "sample": { "messages": [ @@ -27547,10 +27547,10 @@ ] }, "predict": { - "A": 0.011428155936300755, - "B": 0.011428155936300755, - "C": 0.801175594329834, - "D": 0.15776081383228302 + "A": 0.0029181414283812046, + "B": 0.002134957816451788, + "C": 0.9759846925735474, + "D": 0.010842211544513702 }, "sample": { "messages": [ @@ -27592,10 +27592,10 @@ ] }, "predict": { - "A": 0.014924616552889347, - "B": 0.024606533348560333, - "C": 0.923353374004364, - "D": 0.0217151902616024 + "A": 0.030875863507390022, + "B": 0.024046147242188454, + "C": 0.9023250341415405, + "D": 0.027247855439782143 }, "sample": { "messages": [ @@ -27637,10 +27637,10 @@ ] }, "predict": { - "A": 0.0253840833902359, - "B": 0.11376357078552246, - "C": 0.8406053781509399, - "D": 0.0056639546528458595 + "A": 0.003519452176988125, + "B": 0.00956686306744814, + "C": 0.975846529006958, + "D": 0.0012947340728715062 }, "sample": { "messages": [ @@ -27682,10 +27682,10 @@ ] }, "predict": { - "A": 0.028905170038342476, - "B": 0.9572077393531799, - "C": 0.0006797844544053078, - "D": 0.0007702966686338186 + "A": 0.013544623740017414, + "B": 0.9495514631271362, + "C": 0.0003609520208556205, + "D": 0.0009811692871153355 }, "sample": { "messages": [ @@ -27727,10 +27727,10 @@ ] }, "predict": { - "A": 0.001153950928710401, - "B": 0.008526608347892761, - "C": 0.9855419397354126, - "D": 7.852734415791929e-05 + "A": 0.0005494629149325192, + "B": 0.0007055244059301913, + "C": 0.9934523105621338, + "D": 2.1304989786585793e-05 }, "sample": { "messages": [ @@ -27772,10 +27772,10 @@ ] }, "predict": { - "A": 0.0031510682310909033, - "B": 0.0010230019688606262, - "C": 0.990036129951477, - "D": 7.410602120216936e-05 + "A": 0.0031309921760112047, + "B": 0.00021306668350007385, + "C": 0.9837284684181213, + "D": 3.4782180591719225e-05 }, "sample": { "messages": [ @@ -27817,10 +27817,10 @@ ] }, "predict": { - "A": 0.9279510378837585, - "B": 0.0015808759490028024, - "C": 0.00033136954880319536, - "D": 0.0002924326399806887 + "A": 0.8513317704200745, + "B": 0.00014360366913024336, + "C": 5.9862919442821294e-05, + "D": 0.00011905162682523951 }, "sample": { "messages": [ @@ -27862,10 +27862,10 @@ ] }, "predict": { - "A": 0.003935568500310183, - "B": 0.00306502403691411, - "C": 0.0034731270279735327, - "D": 0.9630018472671509 + "A": 0.0009093216503970325, + "B": 0.00013944899546913803, + "C": 0.0007538541685789824, + "D": 0.9367755055427551 }, "sample": { "messages": [ @@ -27907,10 +27907,10 @@ ] }, "predict": { - "A": 0.9432754516601562, - "B": 0.025137439370155334, - "C": 0.00035856632166542113, - "D": 0.000591175863519311 + "A": 0.9550600051879883, + "B": 0.002367357024922967, + "C": 9.179239714285359e-05, + "D": 0.00014217084390111268 }, "sample": { "messages": [ @@ -27952,10 +27952,10 @@ ] }, "predict": { - "A": 0.003545190207660198, - "B": 0.21933314204216003, - "C": 0.001477855141274631, - "D": 0.7655478715896606 + "A": 0.0028994421008974314, + "B": 0.43031537532806396, + "C": 0.00044464386883191764, + "D": 0.552535891532898 }, "sample": { "messages": [ @@ -27997,10 +27997,10 @@ ] }, "predict": { - "A": 0.005840797442942858, - "B": 0.002758997492492199, - "C": 0.9822710752487183, - "D": 0.0008957153186202049 + "A": 0.0015870286151766777, + "B": 0.0002433787303743884, + "C": 0.9916433095932007, + "D": 0.00016727158799767494 }, "sample": { "messages": [ @@ -28042,10 +28042,10 @@ ] }, "predict": { - "A": 0.9765053987503052, - "B": 0.0003275810449849814, - "C": 2.0941557522746734e-05, - "D": 7.309322245419025e-05 + "A": 0.8668915629386902, + "B": 5.726382732973434e-05, + "C": 5.32636613570503e-06, + "D": 3.473226388450712e-05 }, "sample": { "messages": [ @@ -28087,10 +28087,10 @@ ] }, "predict": { - "A": 0.004565463867038488, - "B": 0.003137794090434909, - "C": 0.985865592956543, - "D": 0.0007933586020953953 + "A": 0.0012326196301728487, + "B": 0.00018902831652667373, + "C": 0.988947868347168, + "D": 0.00045345540274865925 }, "sample": { "messages": [ @@ -28132,10 +28132,10 @@ ] }, "predict": { - "A": 0.967495322227478, - "B": 0.009484990499913692, - "C": 0.0007785754278302193, - "D": 0.0005351065192371607 + "A": 0.945670485496521, + "B": 0.0004913464654237032, + "C": 0.00010963421664200723, + "D": 0.00010299181303707883 }, "sample": { "messages": [ @@ -28177,10 +28177,10 @@ ] }, "predict": { - "A": 0.14398197829723358, - "B": 0.0015994955319911242, - "C": 0.0008042767876759171, - "D": 0.828559160232544 + "A": 0.19304361939430237, + "B": 0.00016536768816877156, + "C": 0.00024060859868768603, + "D": 0.7635023593902588 }, "sample": { "messages": [ @@ -28222,10 +28222,10 @@ ] }, "predict": { - "A": 0.013882508501410484, - "B": 0.0016580293886363506, - "C": 0.973239004611969, - "D": 0.0030976065900176764 + "A": 0.004569971933960915, + "B": 0.00017719702736940235, + "C": 0.9868391156196594, + "D": 0.0007941420190036297 }, "sample": { "messages": [ @@ -28267,10 +28267,10 @@ ] }, "predict": { - "A": 0.9469655156135559, - "B": 0.003415290964767337, - "C": 0.0006725106504745781, - "D": 0.0003599690389819443 + "A": 0.8508415818214417, + "B": 0.00039013047353364527, + "C": 0.00034428894286975265, + "D": 0.0001527772838016972 }, "sample": { "messages": [ @@ -28312,10 +28312,10 @@ ] }, "predict": { - "A": 0.017641909420490265, - "B": 0.9632155895233154, - "C": 0.0007751313969492912, - "D": 0.003065704135224223 + "A": 0.019097834825515747, + "B": 0.9201852083206177, + "C": 0.0008932175696827471, + "D": 0.006200158968567848 }, "sample": { "messages": [ @@ -28357,10 +28357,10 @@ ] }, "predict": { - "A": 0.9519920349121094, - "B": 0.003029982093721628, - "C": 0.00015085391351021826, - "D": 0.00028183223912492394 + "A": 0.88003009557724, + "B": 0.0003142570785712451, + "C": 7.012022979324684e-05, + "D": 0.00010860434122150764 }, "sample": { "messages": [ @@ -28402,10 +28402,10 @@ ] }, "predict": { - "A": 0.0009069751831702888, - "B": 0.00025985270622186363, - "C": 0.9946190118789673, - "D": 0.0003780835249926895 + "A": 0.0007048026891425252, + "B": 0.00010153645416721702, + "C": 0.9924361109733582, + "D": 0.00016740532009862363 }, "sample": { "messages": [ @@ -28447,10 +28447,10 @@ ] }, "predict": { - "A": 0.06156023591756821, - "B": 0.1476753205060959, - "C": 0.7499572038650513, - "D": 0.015564864501357079 + "A": 0.03102954477071762, + "B": 0.021326275542378426, + "C": 0.9068162441253662, + "D": 0.018820369616150856 }, "sample": { "messages": [ @@ -28492,10 +28492,10 @@ ] }, "predict": { - "A": 0.9451906085014343, - "B": 0.010500119999051094, - "C": 0.005620308220386505, - "D": 0.0026548460591584444 + "A": 0.8835717439651489, + "B": 0.004636559169739485, + "C": 0.0028122153598815203, + "D": 0.0020574606023728848 }, "sample": { "messages": [ @@ -28530,17 +28530,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.16141386330127716, - "B": 0.8197273015975952, - "C": 0.0008470222819596529, - "D": 0.005523279309272766 + "A": 0.6700130105018616, + "B": 0.24648404121398926, + "C": 0.0012934294063597918, + "D": 0.0027381901163607836 }, "sample": { "messages": [ @@ -28570,7 +28570,7 @@ "prompt_len": 86, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -28582,10 +28582,10 @@ ] }, "predict": { - "A": 0.9555239677429199, - "B": 0.01544460654258728, - "C": 0.000987340696156025, - "D": 0.0020902003161609173 + "A": 0.9376792311668396, + "B": 0.0009689017315395176, + "C": 0.00021619119797833264, + "D": 0.0006659157807007432 }, "sample": { "messages": [ @@ -28627,10 +28627,10 @@ ] }, "predict": { - "A": 0.00453372485935688, - "B": 0.8639749884605408, - "C": 0.10318709909915924, - "D": 0.0035308683291077614 + "A": 0.0030346778221428394, + "B": 0.7425611615180969, + "C": 0.16568779945373535, + "D": 0.001265041995793581 }, "sample": { "messages": [ @@ -28672,10 +28672,10 @@ ] }, "predict": { - "A": 0.04857375845313072, - "B": 0.0623699389398098, - "C": 0.8609904050827026, - "D": 0.012281368486583233 + "A": 0.008322962559759617, + "B": 0.013722245581448078, + "C": 0.9620037078857422, + "D": 0.006481930147856474 }, "sample": { "messages": [ @@ -28717,10 +28717,10 @@ ] }, "predict": { - "A": 0.0007923775119706988, - "B": 0.9846464991569519, - "C": 0.0003303120902273804, - "D": 5.7399633078603074e-05 + "A": 0.0011986854951828718, + "B": 0.9034543633460999, + "C": 0.00030307515407912433, + "D": 6.352800846798345e-05 }, "sample": { "messages": [ @@ -28762,10 +28762,10 @@ ] }, "predict": { - "A": 0.012343976646661758, - "B": 0.0013010455295443535, - "C": 0.9806035161018372, - "D": 0.001670575700700283 + "A": 0.004581234883517027, + "B": 0.00020128539472352713, + "C": 0.9892711639404297, + "D": 0.0004003038047812879 }, "sample": { "messages": [ @@ -28807,10 +28807,10 @@ ] }, "predict": { - "A": 0.927869975566864, - "B": 0.010307705029845238, - "C": 0.0055173165164887905, - "D": 0.021821411326527596 + "A": 0.819586992263794, + "B": 0.004873441997915506, + "C": 0.004873441997915506, + "D": 0.007090816739946604 }, "sample": { "messages": [ @@ -28852,10 +28852,10 @@ ] }, "predict": { - "A": 0.09737671911716461, - "B": 0.6349759101867676, - "C": 0.18192365765571594, - "D": 0.02462068758904934 + "A": 0.07680021971464157, + "B": 0.7286602258682251, + "C": 0.05278396978974342, + "D": 0.010393778793513775 }, "sample": { "messages": [ @@ -28897,10 +28897,10 @@ ] }, "predict": { - "A": 0.02192189358174801, - "B": 0.4403129816055298, - "C": 0.49893999099731445, - "D": 0.0029668053612113 + "A": 0.005962664727121592, + "B": 0.08231204748153687, + "C": 0.8849379420280457, + "D": 0.00045979133574292064 }, "sample": { "messages": [ @@ -28935,17 +28935,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.33346354961395264, - "B": 0.008886502124369144, - "C": 0.6229919195175171, - "D": 0.0005013421759940684 + "A": 0.71197110414505, + "B": 0.0029096631333231926, + "C": 0.12372201681137085, + "D": 0.00017473887419328094 }, "sample": { "messages": [ @@ -28975,7 +28975,7 @@ "prompt_len": 93, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -28987,10 +28987,10 @@ ] }, "predict": { - "A": 0.4757823348045349, - "B": 0.009874546900391579, - "C": 0.37053966522216797, - "D": 0.08267857134342194 + "A": 0.6399253606796265, + "B": 0.006273607723414898, + "C": 0.07642818242311478, + "D": 0.09813573211431503 }, "sample": { "messages": [ @@ -29032,10 +29032,10 @@ ] }, "predict": { - "A": 0.001151296659372747, - "B": 0.9832749962806702, - "C": 6.495170964626595e-05, - "D": 0.00025688897585496306 + "A": 0.0013817385770380497, + "B": 0.9190518260002136, + "C": 4.44159159087576e-05, + "D": 0.00015502677706535906 }, "sample": { "messages": [ @@ -29077,10 +29077,10 @@ ] }, "predict": { - "A": 0.06907187402248383, - "B": 0.35077595710754395, - "C": 0.022424355149269104, - "D": 0.5103759765625 + "A": 0.03035200573503971, + "B": 0.03035200573503971, + "C": 0.003199077909812331, + "D": 0.8870156407356262 }, "sample": { "messages": [ @@ -29122,10 +29122,10 @@ ] }, "predict": { - "A": 0.9561970829963684, - "B": 0.006442805286496878, - "C": 0.0023701756726950407, - "D": 0.0044280714355409145 + "A": 0.9025464653968811, + "B": 0.0008230158709920943, + "C": 0.0004138383665122092, + "D": 0.0004991843597963452 }, "sample": { "messages": [ @@ -29167,10 +29167,10 @@ ] }, "predict": { - "A": 0.023817557841539383, - "B": 0.5420851111412048, - "C": 0.3725692927837372, - "D": 0.030582351610064507 + "A": 0.02809632197022438, + "B": 0.5643297433853149, + "C": 0.3422832787036896, + "D": 0.019310301169753075 }, "sample": { "messages": [ @@ -29212,10 +29212,10 @@ ] }, "predict": { - "A": 0.9511402249336243, - "B": 0.010566214099526405, - "C": 0.002671557478606701, - "D": 0.0011136706452816725 + "A": 0.9023106694221497, + "B": 0.0010564971016719937, + "C": 0.000682125857565552, + "D": 0.0001835916773416102 }, "sample": { "messages": [ @@ -29257,10 +29257,10 @@ ] }, "predict": { - "A": 0.003063071519136429, - "B": 0.005050151143223047, - "C": 0.0016395441489294171, - "D": 0.9623884558677673 + "A": 0.0019226039294153452, + "B": 0.0008531503845006227, + "C": 0.0009081737953238189, + "D": 0.9355929493904114 }, "sample": { "messages": [ @@ -29295,17 +29295,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "D" ] }, "predict": { - "A": 0.19257581233978271, - "B": 0.40768298506736755, - "C": 0.13235528767108917, - "D": 0.21821698546409607 + "A": 0.16443420946598053, + "B": 0.27110618352890015, + "C": 0.12806148827075958, + "D": 0.3072035312652588 }, "sample": { "messages": [ @@ -29335,7 +29335,7 @@ "prompt_len": 65, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -29347,10 +29347,10 @@ ] }, "predict": { - "A": 0.10478319972753525, - "B": 0.004603852517902851, - "C": 0.8773389458656311, - "D": 0.0027923777233809233 + "A": 0.09315642714500427, + "B": 0.0017062195111066103, + "C": 0.8838436007499695, + "D": 0.0006276830681599677 }, "sample": { "messages": [ @@ -29392,10 +29392,10 @@ ] }, "predict": { - "A": 0.03613734990358353, - "B": 0.013294187374413013, - "C": 0.9319945573806763, - "D": 0.004890658427029848 + "A": 0.052238572388887405, + "B": 0.0026008053682744503, + "C": 0.925950825214386, + "D": 0.0012285334523767233 }, "sample": { "messages": [ @@ -29437,10 +29437,10 @@ ] }, "predict": { - "A": 0.9426115155220032, - "B": 0.005604972597211599, - "C": 0.0007585505954921246, - "D": 0.015235896222293377 + "A": 0.8998063802719116, + "B": 0.0005639326409436762, + "C": 0.0002835631894413382, + "D": 0.0036773032043129206 }, "sample": { "messages": [ @@ -29482,10 +29482,10 @@ ] }, "predict": { - "A": 0.003973145503550768, - "B": 0.0065506091341376305, - "C": 0.0016562534729018807, - "D": 0.9721965789794922 + "A": 0.001355583779513836, + "B": 0.0007723883609287441, + "C": 0.00015209226694423705, + "D": 0.9598069190979004 }, "sample": { "messages": [ @@ -29527,10 +29527,10 @@ ] }, "predict": { - "A": 0.0063153584487736225, - "B": 0.036342378705739975, - "C": 0.002802423434332013, - "D": 0.9372822642326355 + "A": 0.004336124751716852, + "B": 0.02202065847814083, + "C": 0.003172376425936818, + "D": 0.936342179775238 }, "sample": { "messages": [ @@ -29572,10 +29572,10 @@ ] }, "predict": { - "A": 0.929020345211029, - "B": 0.017015602439641953, - "C": 0.0008471569162793458, - "D": 0.000747613376006484 + "A": 0.8502549529075623, + "B": 0.0007753322133794427, + "C": 0.0001625184522708878, + "D": 7.211712363641709e-05 }, "sample": { "messages": [ @@ -29617,10 +29617,10 @@ ] }, "predict": { - "A": 0.06956735998392105, - "B": 0.061392977833747864, - "C": 0.8475039005279541, - "D": 0.003463554894551635 + "A": 0.13089093565940857, + "B": 0.07938937097787857, + "C": 0.7532253861427307, + "D": 0.003276790725067258 }, "sample": { "messages": [ @@ -29662,10 +29662,10 @@ ] }, "predict": { - "A": 0.05096757784485817, - "B": 0.021246446296572685, - "C": 0.008856836706399918, - "D": 0.9034218788146973 + "A": 0.0245286263525486, + "B": 0.014877364039421082, + "C": 0.005141479894518852, + "D": 0.9204299449920654 }, "sample": { "messages": [ @@ -29707,10 +29707,10 @@ ] }, "predict": { - "A": 0.02205023169517517, - "B": 0.9375997185707092, - "C": 0.006317497231066227, - "D": 0.0033815125934779644 + "A": 0.030581995844841003, + "B": 0.8937369585037231, + "C": 0.0028445690404623747, + "D": 0.0015225880779325962 }, "sample": { "messages": [ @@ -29752,10 +29752,10 @@ ] }, "predict": { - "A": 0.01553665567189455, - "B": 0.9612188935279846, - "C": 0.000602421467192471, - "D": 0.008316172286868095 + "A": 0.00876590795814991, + "B": 0.8941468596458435, + "C": 0.00038514711195603013, + "D": 0.004140722099691629 }, "sample": { "messages": [ @@ -29790,17 +29790,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "C" + "B" ] }, "predict": { - "A": 0.089509978890419, - "B": 0.21472327411174774, - "C": 0.6613942384719849, - "D": 0.009434281848371029 + "A": 0.03036325052380562, + "B": 0.5382014513015747, + "C": 0.369900107383728, + "D": 0.0052763414569199085 }, "sample": { "messages": [ @@ -29830,7 +29830,7 @@ "prompt_len": 84, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " B" } } { @@ -29842,10 +29842,10 @@ ] }, "predict": { - "A": 0.00028712773928418756, - "B": 0.9698795080184937, - "C": 0.00047339359298348427, - "D": 0.0002236152795376256 + "A": 0.00014054225175641477, + "B": 0.8331826329231262, + "C": 0.00033714334131218493, + "D": 0.00023171500652097166 }, "sample": { "messages": [ @@ -29887,10 +29887,10 @@ ] }, "predict": { - "A": 0.08063538372516632, - "B": 0.00963052362203598, - "C": 0.7650471925735474, - "D": 0.132945254445076 + "A": 0.048863593488931656, + "B": 0.0007898064213804901, + "C": 0.8661279082298279, + "D": 0.07109610736370087 }, "sample": { "messages": [ @@ -29932,10 +29932,10 @@ ] }, "predict": { - "A": 0.000479917973279953, - "B": 0.9832465052604675, - "C": 8.339722990058362e-05, - "D": 0.009639410302042961 + "A": 0.0008599945576861501, + "B": 0.9430985450744629, + "C": 0.00026228351634927094, + "D": 0.022179553285241127 }, "sample": { "messages": [ @@ -29970,17 +29970,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.24035118520259857, - "B": 0.7403337955474854, - "C": 0.0004094670293852687, - "D": 0.0006750970496796072 + "A": 0.671310305595398, + "B": 0.2469612956047058, + "C": 0.0002891619224101305, + "D": 0.0004207280871924013 }, "sample": { "messages": [ @@ -30010,7 +30010,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -30022,10 +30022,10 @@ ] }, "predict": { - "A": 0.000889766204636544, - "B": 0.9757471084594727, - "C": 9.982899791793898e-05, - "D": 0.0005396704655140638 + "A": 0.0010017092572525144, + "B": 0.9106947779655457, + "C": 8.75283803907223e-05, + "D": 0.0003922749892808497 }, "sample": { "messages": [ @@ -30067,10 +30067,10 @@ ] }, "predict": { - "A": 0.97016441822052, - "B": 0.0010024685179814696, - "C": 3.22242958645802e-05, - "D": 0.0001197277961182408 + "A": 0.9235875010490417, + "B": 8.33893718663603e-05, + "C": 8.256665751105174e-06, + "D": 3.476185884210281e-05 }, "sample": { "messages": [ @@ -30105,17 +30105,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.8703052997589111, - "B": 0.008532173000276089, - "C": 0.0021572711411863565, - "D": 0.08095099776983261 + "A": 0.29844701290130615, + "B": 0.0018890845822170377, + "C": 0.002582072513177991, + "D": 0.631812334060669 }, "sample": { "messages": [ @@ -30145,7 +30145,7 @@ "prompt_len": 88, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -30157,10 +30157,10 @@ ] }, "predict": { - "A": 0.01741129532456398, - "B": 0.011966596357524395, - "C": 0.0020794826559722424, - "D": 0.9506244659423828 + "A": 0.019680486992001534, + "B": 0.0034199559595435858, + "C": 0.0012581314658746123, + "D": 0.9482589960098267 }, "sample": { "messages": [ @@ -30202,10 +30202,10 @@ ] }, "predict": { - "A": 0.001160584855824709, - "B": 0.00022853272093925625, - "C": 0.9912077188491821, - "D": 0.0004838038294110447 + "A": 0.0011540668783709407, + "B": 0.00013783364556729794, + "C": 0.9856410026550293, + "D": 0.00024190559634007514 }, "sample": { "messages": [ @@ -30247,10 +30247,10 @@ ] }, "predict": { - "A": 0.9657908082008362, - "B": 0.003073900705203414, - "C": 0.0005341637879610062, - "D": 0.0005341637879610062 + "A": 0.8923818469047546, + "B": 0.0003843868325930089, + "C": 0.0002641849569045007, + "D": 0.0001602362608537078 }, "sample": { "messages": [ @@ -30292,10 +30292,10 @@ ] }, "predict": { - "A": 0.5603888630867004, - "B": 0.20615555346012115, - "C": 0.18193161487579346, - "D": 0.010263879783451557 + "A": 0.64970463514328, + "B": 0.06043194234371185, + "C": 0.1642710566520691, + "D": 0.0087060471996665 }, "sample": { "messages": [ @@ -30337,10 +30337,10 @@ ] }, "predict": { - "A": 0.0011234793346375227, - "B": 0.9595174193382263, - "C": 0.00022122620430309325, - "D": 0.0002506821183487773 + "A": 0.0014997349353507161, + "B": 0.8803225159645081, + "C": 0.00021605730580631644, + "D": 0.00017911777831614017 }, "sample": { "messages": [ @@ -30382,10 +30382,10 @@ ] }, "predict": { - "A": 0.127217635512352, - "B": 0.025050636380910873, - "C": 0.8295631408691406, - "D": 0.00408940389752388 + "A": 0.35421502590179443, + "B": 0.003472601529210806, + "C": 0.5840018391609192, + "D": 0.0005668866215273738 }, "sample": { "messages": [ @@ -30427,10 +30427,10 @@ ] }, "predict": { - "A": 0.720893919467926, - "B": 0.0522213838994503, - "C": 0.11055266857147217, - "D": 0.08609850704669952 + "A": 0.6396507620811462, + "B": 0.008571325801312923, + "C": 0.20766420662403107, + "D": 0.09809362888336182 }, "sample": { "messages": [ @@ -30472,10 +30472,10 @@ ] }, "predict": { - "A": 0.6919283270835876, - "B": 0.016272595152258873, - "C": 0.03903592750430107, - "D": 0.19824078679084778 + "A": 0.8405992984771729, + "B": 0.0020836375188082457, + "C": 0.013587022200226784, + "D": 0.025383900851011276 }, "sample": { "messages": [ @@ -30517,10 +30517,10 @@ ] }, "predict": { - "A": 0.001582279335707426, - "B": 0.046240974217653275, - "C": 0.0020316867157816887, - "D": 0.9287748336791992 + "A": 0.00038621333078481257, + "B": 0.0064310296438634396, + "C": 0.0007215415243990719, + "D": 0.9544494152069092 }, "sample": { "messages": [ @@ -30562,10 +30562,10 @@ ] }, "predict": { - "A": 0.9597984552383423, - "B": 0.0016351318918168545, - "C": 0.00036484721931628883, - "D": 0.00015209092816803604 + "A": 0.8506119847297668, + "B": 0.00039002520497888327, + "C": 0.0004151797038502991, + "D": 0.00010497385665075853 }, "sample": { "messages": [ @@ -30607,10 +30607,10 @@ ] }, "predict": { - "A": 0.007462889421731234, - "B": 0.0025791057851165533, - "C": 0.977445662021637, - "D": 0.0004481813812162727 + "A": 0.003554385621100664, + "B": 0.0002272242563776672, + "C": 0.9855325818061829, + "D": 5.745128873968497e-05 }, "sample": { "messages": [ @@ -30652,10 +30652,10 @@ ] }, "predict": { - "A": 0.8801134824752808, - "B": 0.056263770908117294, - "C": 0.000802559603471309, - "D": 0.0021815833169966936 + "A": 0.8632414937019348, + "B": 0.003113334998488426, + "C": 9.40145764616318e-05, + "D": 0.00032814309815876186 }, "sample": { "messages": [ @@ -30697,10 +30697,10 @@ ] }, "predict": { - "A": 0.0814008042216301, - "B": 0.02058134786784649, - "C": 0.8751412034034729, - "D": 0.0045923194848001 + "A": 0.04574970155954361, + "B": 0.005464022513478994, + "C": 0.9189073443412781, + "D": 0.0012978191953152418 }, "sample": { "messages": [ @@ -30742,10 +30742,10 @@ ] }, "predict": { - "A": 0.9488484859466553, - "B": 0.003877726849168539, - "C": 0.0012589135440066457, - "D": 0.0026651201769709587 + "A": 0.9126138687133789, + "B": 0.0006088477093726397, + "C": 0.0005047524464316666, + "D": 0.0008321961504407227 }, "sample": { "messages": [ @@ -30780,17 +30780,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.13981802761554718, - "B": 0.8045972585678101, - "C": 0.00788799487054348, - "D": 0.005421333946287632 + "A": 0.5535526275634766, + "B": 0.3357466459274292, + "C": 0.008947344496846199, + "D": 0.011488618329167366 }, "sample": { "messages": [ @@ -30820,7 +30820,7 @@ "prompt_len": 58, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -30832,10 +30832,10 @@ ] }, "predict": { - "A": 0.007445597089827061, - "B": 0.009560337290167809, - "C": 0.9751808643341064, - "D": 0.00016449467511847615 + "A": 0.0031462388578802347, + "B": 0.0016840603202581406, + "C": 0.9885187745094299, + "D": 3.4951553971040994e-05 }, "sample": { "messages": [ @@ -30877,10 +30877,10 @@ ] }, "predict": { - "A": 0.06637062132358551, - "B": 0.0022710778284817934, - "C": 0.0009467260097153485, - "D": 0.916218101978302 + "A": 0.17529381811618805, + "B": 0.00036022139829583466, + "C": 0.00024757630308158696, + "D": 0.7856124043464661 }, "sample": { "messages": [ @@ -30922,10 +30922,10 @@ ] }, "predict": { - "A": 0.9595370888710022, - "B": 0.0018523423932492733, - "C": 0.00041331344982609153, - "D": 0.004443538375198841 + "A": 0.8649008870124817, + "B": 0.0003725496062543243, + "C": 7.8090641181916e-05, + "D": 0.001669651479460299 }, "sample": { "messages": [ @@ -30967,10 +30967,10 @@ ] }, "predict": { - "A": 0.9452422261238098, - "B": 0.007217013277113438, - "C": 0.002343021333217621, - "D": 0.000861949345562607 + "A": 0.879827618598938, + "B": 0.0004034212324768305, + "C": 0.00019056268502026796, + "D": 0.00013941864017397165 }, "sample": { "messages": [ @@ -31012,10 +31012,10 @@ ] }, "predict": { - "A": 0.01190763246268034, - "B": 0.01963236555457115, - "C": 0.0005569273489527404, - "D": 0.945940375328064 + "A": 0.0019975677132606506, + "B": 0.0015557074220851064, + "C": 0.00010586722055450082, + "D": 0.972072422504425 }, "sample": { "messages": [ @@ -31057,10 +31057,10 @@ ] }, "predict": { - "A": 0.004019877873361111, - "B": 0.006627658382058144, - "C": 0.9836317300796509, - "D": 0.000256982195423916 + "A": 0.0058587961830198765, + "B": 0.0035535397473722696, + "C": 0.9852980375289917, + "D": 6.928299990249798e-05 }, "sample": { "messages": [ @@ -31102,10 +31102,10 @@ ] }, "predict": { - "A": 0.004030010662972927, - "B": 0.0021571090910583735, - "C": 0.986111044883728, - "D": 0.00037484936183318496 + "A": 0.0016701160930097103, + "B": 0.0008939486579038203, + "C": 0.9803337454795837, + "D": 0.00017602891603019089 }, "sample": { "messages": [ @@ -31147,10 +31147,10 @@ ] }, "predict": { - "A": 0.1686868965625763, - "B": 0.27811771631240845, - "C": 0.005772148724645376, - "D": 0.5195922255516052 + "A": 0.10107871890068054, + "B": 0.037184782326221466, + "C": 0.0017391552682965994, + "D": 0.846321702003479 }, "sample": { "messages": [ @@ -31192,10 +31192,10 @@ ] }, "predict": { - "A": 0.3949921429157257, - "B": 0.0030158022418618202, - "C": 0.0008640417363494635, - "D": 0.5747101306915283 + "A": 0.1751376986503601, + "B": 0.0013371928362175822, + "C": 0.0009783111745491624, + "D": 0.784912645816803 }, "sample": { "messages": [ @@ -31237,10 +31237,10 @@ ] }, "predict": { - "A": 0.00025561722577549517, - "B": 0.9784072041511536, - "C": 0.00047755593550391495, - "D": 0.0012981315376237035 + "A": 0.00013509750715456903, + "B": 0.9075435400009155, + "C": 0.0007303302409127355, + "D": 0.0015461092116311193 }, "sample": { "messages": [ @@ -31282,10 +31282,10 @@ ] }, "predict": { - "A": 0.8363548517227173, - "B": 0.07779311388731003, - "C": 0.053466372191905975, - "D": 0.0016145446570590138 + "A": 0.9238330125808716, + "B": 0.0037754944059997797, + "C": 0.004278196021914482, + "D": 0.0002734959125518799 }, "sample": { "messages": [ @@ -31327,10 +31327,10 @@ ] }, "predict": { - "A": 0.0030817596707493067, - "B": 0.0027196432929486036, - "C": 0.003957057371735573, - "D": 0.9682600498199463 + "A": 0.003715541446581483, + "B": 0.0008290493860840797, + "C": 0.018869075924158096, + "D": 0.9091629981994629 }, "sample": { "messages": [ @@ -31372,10 +31372,10 @@ ] }, "predict": { - "A": 0.4619113802909851, - "B": 0.31746673583984375, - "C": 0.022997211664915085, - "D": 0.13233983516693115 + "A": 0.4310813248157501, + "B": 0.07491069287061691, + "C": 0.012228836305439472, + "D": 0.38042792677879333 }, "sample": { "messages": [ @@ -31417,10 +31417,10 @@ ] }, "predict": { - "A": 0.7901349067687988, - "B": 0.012771341018378735, - "C": 0.15558677911758423, - "D": 0.007746210787445307 + "A": 0.6090711951255798, + "B": 0.004103889688849449, + "C": 0.2877048850059509, + "D": 0.001418266212567687 }, "sample": { "messages": [ @@ -31462,10 +31462,10 @@ ] }, "predict": { - "A": 0.959308385848999, - "B": 0.0034598063211888075, - "C": 0.0007719871355220675, - "D": 0.002694500144571066 + "A": 0.7494534254074097, + "B": 0.0012767837615683675, + "C": 0.0006834130617789924, + "D": 0.0014467854052782059 }, "sample": { "messages": [ @@ -31507,10 +31507,10 @@ ] }, "predict": { - "A": 0.0007955501205287874, - "B": 0.9885889291763306, - "C": 4.7776502469787374e-05, - "D": 0.00015665309911128134 + "A": 0.00044864011579193175, + "B": 0.9191650152206421, + "C": 4.442138742888346e-05, + "D": 0.00022559041099157184 }, "sample": { "messages": [ @@ -31552,10 +31552,10 @@ ] }, "predict": { - "A": 0.016987137496471405, - "B": 0.9274662733078003, - "C": 0.028007054701447487, - "D": 0.011675077490508556 + "A": 0.01592033915221691, + "B": 0.7670848965644836, + "C": 0.1332993507385254, + "D": 0.01592033915221691 }, "sample": { "messages": [ @@ -31597,10 +31597,10 @@ ] }, "predict": { - "A": 0.49315887689590454, - "B": 0.49315887689590454, - "C": 0.00030907581094652414, - "D": 0.0001874639856396243 + "A": 0.7599543333053589, + "B": 0.19214655458927155, + "C": 0.00014525830920320004, + "D": 6.861517613288015e-05 }, "sample": { "messages": [ @@ -31635,17 +31635,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.2003297507762909, - "B": 0.42409807443618774, - "C": 0.330287903547287, - "D": 0.01280665211379528 + "A": 0.08175595849752426, + "B": 0.17307735979557037, + "C": 0.6845342516899109, + "D": 0.005922381766140461 }, "sample": { "messages": [ @@ -31675,7 +31675,7 @@ "prompt_len": 134, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -31687,10 +31687,10 @@ ] }, "predict": { - "A": 0.0006212056032381952, - "B": 0.0002589570067357272, - "C": 0.9911906719207764, - "D": 3.092800761805847e-05 + "A": 0.00027389833121560514, + "B": 5.066609810455702e-05, + "C": 0.984862208366394, + "D": 1.4516079318127595e-05 }, "sample": { "messages": [ @@ -31732,10 +31732,10 @@ ] }, "predict": { - "A": 0.004316793754696846, - "B": 0.004316793754696846, - "C": 0.9321678280830383, - "D": 0.05258931219577789 + "A": 0.001150182681158185, + "B": 0.0008414916810579598, + "C": 0.9823236465454102, + "D": 0.007500133477151394 }, "sample": { "messages": [ @@ -31777,10 +31777,10 @@ ] }, "predict": { - "A": 0.6760177612304688, - "B": 0.026212053373456, - "C": 0.00584869971498847, - "D": 0.2486930638551712 + "A": 0.5770129561424255, + "B": 0.0053141056559979916, + "C": 0.004689682275056839, + "D": 0.30885279178619385 }, "sample": { "messages": [ @@ -31822,10 +31822,10 @@ ] }, "predict": { - "A": 0.05870657041668892, - "B": 0.009002945385873318, - "C": 0.9183253049850464, - "D": 0.0009489033836871386 + "A": 0.05205283313989639, + "B": 0.0013871608534827828, + "C": 0.9226585030555725, + "D": 0.00019983947277069092 }, "sample": { "messages": [ @@ -31867,10 +31867,10 @@ ] }, "predict": { - "A": 0.041418131440877914, - "B": 0.003852486377581954, - "C": 0.9426723122596741, - "D": 0.0020620874129235744 + "A": 0.08217554539442062, + "B": 0.0006274180486798286, + "C": 0.8834704160690308, + "D": 0.016181325539946556 }, "sample": { "messages": [ @@ -31905,17 +31905,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.05434921756386757, - "B": 0.5156510472297668, - "C": 0.35440143942832947, - "D": 0.03735363855957985 + "A": 0.046913523226976395, + "B": 0.23824654519557953, + "C": 0.6476211547851562, + "D": 0.01114293746650219 }, "sample": { "messages": [ @@ -31945,7 +31945,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -31957,10 +31957,10 @@ ] }, "predict": { - "A": 0.02767147123813629, - "B": 0.7136566638946533, - "C": 0.2316903918981552, - "D": 0.0033048861660063267 + "A": 0.020278973504900932, + "B": 0.46154728531837463, + "C": 0.46154728531837463, + "D": 0.0018862382275983691 }, "sample": { "messages": [ @@ -31990,7 +31990,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -32002,10 +32002,10 @@ ] }, "predict": { - "A": 0.022601833567023277, - "B": 0.848127543926239, - "C": 0.11478158086538315, - "D": 0.001637271256186068 + "A": 0.030925601720809937, + "B": 0.903778612613678, + "C": 0.02729174867272377, + "D": 0.0011991157662123442 }, "sample": { "messages": [ @@ -32040,17 +32040,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.09130517393350601, - "B": 0.8662794232368469, - "C": 0.0035402860958129168, - "D": 0.02964245155453682 + "A": 0.6287668943405151, + "B": 0.2970084547996521, + "C": 0.005439899396151304, + "D": 0.02437993697822094 }, "sample": { "messages": [ @@ -32080,7 +32080,7 @@ "prompt_len": 94, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -32092,10 +32092,10 @@ ] }, "predict": { - "A": 0.002378404838964343, - "B": 0.9595170021057129, - "C": 0.013686774298548698, - "D": 0.001852303626947105 + "A": 0.002639570040628314, + "B": 0.8828153014183044, + "C": 0.034230463206768036, + "D": 0.0017042343970388174 }, "sample": { "messages": [ @@ -32137,10 +32137,10 @@ ] }, "predict": { - "A": 0.00025817894493229687, - "B": 0.988212525844574, - "C": 0.0001565934653626755, - "D": 0.0007952472660690546 + "A": 0.0002042824780801311, + "B": 0.9431718587875366, + "C": 9.649620915297419e-05, + "D": 0.0005216535646468401 }, "sample": { "messages": [ @@ -32175,17 +32175,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.18770192563533783, - "B": 0.4502735137939453, - "C": 0.05377750098705292, - "D": 0.2731046974658966 + "A": 0.4727119505405426, + "B": 0.056457389146089554, + "C": 0.093082495033741, + "D": 0.3248898684978485 }, "sample": { "messages": [ @@ -32215,7 +32215,7 @@ "prompt_len": 93, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -32227,10 +32227,10 @@ ] }, "predict": { - "A": 0.8808791637420654, - "B": 0.043856389820575714, - "C": 0.01828206330537796, - "D": 0.03415539115667343 + "A": 0.9267851114273071, + "B": 0.0021580858156085014, + "C": 0.004291866905987263, + "D": 0.019234810024499893 }, "sample": { "messages": [ @@ -32272,10 +32272,10 @@ ] }, "predict": { - "A": 0.6052311658859253, - "B": 0.012561171315610409, - "C": 0.3239569067955017, - "D": 0.016128864139318466 + "A": 0.6284160614013672, + "B": 0.002000110689550638, + "C": 0.2040168195962906, + "D": 0.0037366983015090227 }, "sample": { "messages": [ @@ -32317,10 +32317,10 @@ ] }, "predict": { - "A": 0.007425230927765369, - "B": 0.9725134372711182, - "C": 0.0012903118040412664, - "D": 0.0024106197524815798 + "A": 0.0012560959439724684, + "B": 0.9467248320579529, + "C": 0.0005573892267420888, + "D": 0.0016128593124449253 }, "sample": { "messages": [ @@ -32362,10 +32362,10 @@ ] }, "predict": { - "A": 0.0006024774047546089, - "B": 0.9613080620765686, - "C": 0.0002845902054104954, - "D": 0.001275444752536714 + "A": 0.00022069750411901623, + "B": 0.8992289304733276, + "C": 0.00019476484158076346, + "D": 0.0010528888087719679 }, "sample": { "messages": [ @@ -32407,10 +32407,10 @@ ] }, "predict": { - "A": 0.4872603416442871, - "B": 0.03114951215684414, - "C": 0.4300057590007782, - "D": 0.004215627908706665 + "A": 0.495492160320282, + "B": 0.002024963265284896, + "C": 0.3858896791934967, + "D": 0.0006574093131348491 }, "sample": { "messages": [ @@ -32452,10 +32452,10 @@ ] }, "predict": { - "A": 0.004498778842389584, - "B": 0.9714656472206116, - "C": 0.000885862042196095, - "D": 0.0030919623095542192 + "A": 0.006301213055849075, + "B": 0.9351829886436462, + "C": 0.0011656073620542884, + "D": 0.009168211370706558 }, "sample": { "messages": [ @@ -32497,10 +32497,10 @@ ] }, "predict": { - "A": 0.40915775299072266, - "B": 0.36108043789863586, - "C": 0.03358571231365204, - "D": 0.055373478680849075 + "A": 0.6422213315963745, + "B": 0.1116013303399086, + "C": 0.021975606679916382, + "D": 0.028217237442731857 }, "sample": { "messages": [ @@ -32535,17 +32535,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "C" + "B" ] }, "predict": { - "A": 0.2568757236003876, - "B": 0.2910783290863037, - "C": 0.3737519681453705, - "D": 0.05058182775974274 + "A": 0.08363662660121918, + "B": 0.4247418940067291, + "C": 0.4247418940067291, + "D": 0.0053467112593352795 }, "sample": { "messages": [ @@ -32587,10 +32587,10 @@ ] }, "predict": { - "A": 0.9359923601150513, - "B": 0.0015945753548294306, - "C": 0.0010959343053400517, - "D": 0.000277095619821921 + "A": 0.8347563743591309, + "B": 0.00020487402798607945, + "C": 0.0001322766038356349, + "D": 8.540419366909191e-05 }, "sample": { "messages": [ @@ -32632,10 +32632,10 @@ ] }, "predict": { - "A": 0.025511715561151505, - "B": 0.0018480625003576279, - "C": 0.0023729593958705664, - "D": 0.9573200345039368 + "A": 0.06547539681196213, + "B": 0.0006418981356546283, + "C": 0.001199223566800356, + "D": 0.9038599133491516 }, "sample": { "messages": [ @@ -32677,10 +32677,10 @@ ] }, "predict": { - "A": 0.0022929394617676735, - "B": 0.0009558394085615873, - "C": 0.984697699546814, - "D": 0.0005446209106594324 + "A": 0.00031104343361221254, + "B": 7.864408689783886e-05, + "C": 0.9870071411132812, + "D": 3.48981047864072e-05 }, "sample": { "messages": [ @@ -32722,10 +32722,10 @@ ] }, "predict": { - "A": 0.9682675004005432, - "B": 0.0014557313406839967, - "C": 6.396036769729108e-05, - "D": 6.396036769729108e-05 + "A": 0.9320611357688904, + "B": 0.0001673610822763294, + "C": 1.7639727957430296e-05, + "D": 1.1389064638933633e-05 }, "sample": { "messages": [ @@ -32767,10 +32767,10 @@ ] }, "predict": { - "A": 0.9257581830024719, - "B": 0.02177174761891365, - "C": 0.013205231167376041, - "D": 0.014963487163186073 + "A": 0.9107319116592407, + "B": 0.0032846122048795223, + "C": 0.012990892864763737, + "D": 0.006136463489383459 }, "sample": { "messages": [ @@ -32812,10 +32812,10 @@ ] }, "predict": { - "A": 0.07104281336069107, - "B": 0.1503976285457611, - "C": 0.7637822031974792, - "D": 0.0007892143330536783 + "A": 0.09566657245159149, + "B": 0.08442544937133789, + "C": 0.8010063767433167, + "D": 0.0006055421545170248 }, "sample": { "messages": [ @@ -32850,17 +32850,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.1574886441230774, - "B": 0.17845803499221802, - "C": 0.08429759740829468, - "D": 0.485099196434021 + "A": 0.26527896523475647, + "B": 0.1608998328447342, + "C": 0.2341078668832779, + "D": 0.2341078668832779 }, "sample": { "messages": [ @@ -32890,7 +32890,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -32902,10 +32902,10 @@ ] }, "predict": { - "A": 0.9494311809539795, - "B": 0.01970481500029564, - "C": 0.0008657691651023924, - "D": 0.0018328333972021937 + "A": 0.946386992931366, + "B": 0.0043826415203511715, + "C": 0.0001925598189700395, + "D": 0.0010409685783088207 }, "sample": { "messages": [ @@ -32947,10 +32947,10 @@ ] }, "predict": { - "A": 0.002157687908038497, - "B": 0.0019041529158130288, - "C": 0.986375629901886, - "D": 0.0005124958115629852 + "A": 0.00037416061968542635, + "B": 0.00020027374557685107, + "C": 0.9842991828918457, + "D": 0.00022693988285027444 }, "sample": { "messages": [ @@ -32992,10 +32992,10 @@ ] }, "predict": { - "A": 0.0006973659037612379, - "B": 0.005838972982019186, - "C": 0.9819643497467041, - "D": 0.0016728939954191446 + "A": 0.00012236765178386122, + "B": 0.00070417724782601, + "C": 0.991555392742157, + "D": 0.0003769189352169633 }, "sample": { "messages": [ @@ -33030,17 +33030,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.04774073138833046, - "B": 0.5132612586021423, - "C": 0.3527589738368988, - "D": 0.054097335785627365 + "A": 0.024749046191573143, + "B": 0.02804434485733509, + "C": 0.9287011027336121, + "D": 0.007090719882398844 }, "sample": { "messages": [ @@ -33070,7 +33070,7 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -33082,10 +33082,10 @@ ] }, "predict": { - "A": 0.012243646197021008, - "B": 0.005783488508313894, - "C": 0.9726333022117615, - "D": 0.0002106635511154309 + "A": 0.002167064230889082, + "B": 0.00025881887995637953, + "C": 0.9906619191169739, + "D": 2.261534427816514e-05 }, "sample": { "messages": [ @@ -33127,10 +33127,10 @@ ] }, "predict": { - "A": 0.00047494395403191447, - "B": 0.001139331143349409, - "C": 7.283501327037811e-05, - "D": 0.9730557799339294 + "A": 0.00022652771440334618, + "B": 0.000129071602714248, + "C": 2.7054875317844562e-05, + "D": 0.9229840636253357 }, "sample": { "messages": [ @@ -33172,10 +33172,10 @@ ] }, "predict": { - "A": 0.002407490974292159, - "B": 0.9712511897087097, - "C": 0.00016383182082790881, - "D": 0.00019761889416258782 + "A": 0.0019372256938368082, + "B": 0.8855924606323242, + "C": 0.00010929079144261777, + "D": 0.0002970831701532006 }, "sample": { "messages": [ @@ -33217,10 +33217,10 @@ ] }, "predict": { - "A": 0.0011573773808777332, - "B": 0.001908192876726389, - "C": 0.98846834897995, - "D": 3.960327376262285e-05 + "A": 0.0005458205123431981, + "B": 0.0007008473621681333, + "C": 0.9868665933609009, + "D": 1.5483736206078902e-05 }, "sample": { "messages": [ @@ -33262,10 +33262,10 @@ ] }, "predict": { - "A": 0.002437559887766838, - "B": 0.002762117423117161, - "C": 0.9833818674087524, - "D": 0.0003298878436908126 + "A": 0.0012985931243747473, + "B": 0.00044878179323859513, + "C": 0.9787551760673523, + "D": 7.326161721721292e-05 }, "sample": { "messages": [ @@ -33307,10 +33307,10 @@ ] }, "predict": { - "A": 0.0016738035483285785, - "B": 0.0010152130853384733, - "C": 0.9824982285499573, - "D": 0.004015245474874973 + "A": 0.0005236045690253377, + "B": 0.00018095289124175906, + "C": 0.946699321269989, + "D": 0.025228682905435562 }, "sample": { "messages": [ @@ -33352,10 +33352,10 @@ ] }, "predict": { - "A": 0.01974729634821415, - "B": 0.21230344474315643, - "C": 0.7410117983818054, - "D": 0.0007656857487745583 + "A": 0.010634852573275566, + "B": 0.017533907666802406, + "C": 0.9573188424110413, + "D": 0.00019478410831652582 }, "sample": { "messages": [ @@ -33397,10 +33397,10 @@ ] }, "predict": { - "A": 0.0067324331030249596, - "B": 0.8817747235298157, - "C": 0.0008040745160542428, - "D": 0.0929383710026741 + "A": 0.06337807327508926, + "B": 0.7721030116081238, + "C": 0.001586638973094523, + "D": 0.11840583384037018 }, "sample": { "messages": [ @@ -33442,10 +33442,10 @@ ] }, "predict": { - "A": 0.006482566241174936, - "B": 0.017621442675590515, - "C": 0.9620981812477112, - "D": 0.0006418609991669655 + "A": 0.005165965296328068, + "B": 0.0017853097524493933, + "C": 0.9844586849212646, + "D": 0.00010721619037212804 }, "sample": { "messages": [ @@ -33487,10 +33487,10 @@ ] }, "predict": { - "A": 0.028857706114649773, - "B": 0.9556359648704529, - "C": 0.0002496680535841733, - "D": 8.105535380309448e-05 + "A": 0.07038408517837524, + "B": 0.8574536442756653, + "C": 0.0006089415983296931, + "D": 0.00011264287604717538 }, "sample": { "messages": [ @@ -33532,10 +33532,10 @@ ] }, "predict": { - "A": 0.03360026702284813, - "B": 0.07113176584243774, - "C": 0.001672858721576631, - "D": 0.8665623068809509 + "A": 0.035756491124629974, + "B": 0.009040657430887222, + "C": 0.00032930588349699974, + "D": 0.9221720099449158 }, "sample": { "messages": [ @@ -33577,10 +33577,10 @@ ] }, "predict": { - "A": 0.009252951480448246, - "B": 0.015255537815392017, - "C": 0.006359454244375229, - "D": 0.9438266754150391 + "A": 0.02141321264207363, + "B": 0.0034956110175698996, + "C": 0.003084865864366293, + "D": 0.9105129837989807 }, "sample": { "messages": [ @@ -33622,10 +33622,10 @@ ] }, "predict": { - "A": 0.028629638254642487, - "B": 0.007238705642521381, - "C": 0.003017541952431202, - "D": 0.9480834007263184 + "A": 0.013757656328380108, + "B": 0.000996601302176714, + "C": 0.0013621924445033073, + "D": 0.9644861817359924 }, "sample": { "messages": [ @@ -33667,10 +33667,10 @@ ] }, "predict": { - "A": 0.0008916441584005952, - "B": 0.0014700726605951786, - "C": 0.0001000396950985305, - "D": 0.9778065085411072 + "A": 0.00023046982823871076, + "B": 0.0002033888886217028, + "C": 4.263262962922454e-05, + "D": 0.9390461444854736 }, "sample": { "messages": [ @@ -33712,10 +33712,10 @@ ] }, "predict": { - "A": 0.20438425242900848, - "B": 0.26243460178375244, - "C": 0.49029234051704407, - "D": 0.016776882112026215 + "A": 0.07237915694713593, + "B": 0.018300315365195274, + "C": 0.8817586302757263, + "D": 0.0033852183260023594 }, "sample": { "messages": [ @@ -33753,14 +33753,14 @@ "acc": false, "f1_macro": [ "B", - "C" + "A" ] }, "predict": { - "A": 0.1404000073671341, - "B": 0.03132747858762741, - "C": 0.8079463243484497, - "D": 0.002913909498602152 + "A": 0.5093133449554443, + "B": 0.011977901682257652, + "C": 0.3500455915927887, + "D": 0.002358588855713606 }, "sample": { "messages": [ @@ -33790,7 +33790,7 @@ "prompt_len": 88, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -33802,10 +33802,10 @@ ] }, "predict": { - "A": 0.07821138948202133, - "B": 0.05375384911894798, - "C": 0.8408516645431519, - "D": 0.004999886732548475 + "A": 0.10313661396503448, + "B": 0.00959319993853569, + "C": 0.8635522723197937, + "D": 0.0012982982443645597 }, "sample": { "messages": [ @@ -33847,10 +33847,10 @@ ] }, "predict": { - "A": 0.963305652141571, - "B": 0.00505496421828866, - "C": 0.00036618037847802043, - "D": 0.00047018492477945983 + "A": 0.8801690936088562, + "B": 0.00037912625703029335, + "C": 2.746381505858153e-05, + "D": 5.814089308842085e-05 }, "sample": { "messages": [ @@ -33892,10 +33892,10 @@ ] }, "predict": { - "A": 0.004546968266367912, - "B": 0.9818716645240784, - "C": 0.00017630508227739483, - "D": 0.00019977983902208507 + "A": 0.008127770386636257, + "B": 0.93944251537323, + "C": 0.0002305671077920124, + "D": 0.00029605402960442007 }, "sample": { "messages": [ @@ -33930,17 +33930,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.10348251461982727, - "B": 0.5255275964736938, - "C": 0.005152091849595308, - "D": 0.3187486231327057 + "A": 0.11401455104351044, + "B": 0.0691533163189888, + "C": 0.0014352314174175262, + "D": 0.7434682250022888 }, "sample": { "messages": [ @@ -33970,7 +33970,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -33982,10 +33982,10 @@ ] }, "predict": { - "A": 0.0014780546771362424, - "B": 0.9831157326698303, - "C": 7.358801667578518e-05, - "D": 0.00020003295503556728 + "A": 0.000817824387922883, + "B": 0.8968533277511597, + "C": 6.306384602794424e-05, + "D": 0.00013350615336094052 }, "sample": { "messages": [ @@ -34027,10 +34027,10 @@ ] }, "predict": { - "A": 0.9649126529693604, - "B": 0.0011297964956611395, - "C": 0.00013493496226146817, - "D": 0.00013493496226146817 + "A": 0.8801324963569641, + "B": 0.0002299421321367845, + "C": 5.130702629685402e-05, + "D": 6.188809493323788e-05 }, "sample": { "messages": [ @@ -34072,10 +34072,10 @@ ] }, "predict": { - "A": 0.8817278742790222, - "B": 0.034188296645879745, - "C": 0.02349725179374218, - "D": 0.0007095555192790926 + "A": 0.851172685623169, + "B": 0.006498783361166716, + "C": 0.02570318803191185, + "D": 0.0002089030749630183 }, "sample": { "messages": [ @@ -34110,17 +34110,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.3326675593852997, - "B": 0.004745247308164835, - "C": 0.6215047836303711, - "D": 0.00023625197354704142 + "A": 0.716234564781189, + "B": 0.0013826580252498388, + "C": 0.0754905641078949, + "D": 6.883848982397467e-05 }, "sample": { "messages": [ @@ -34150,7 +34150,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -34162,10 +34162,10 @@ ] }, "predict": { - "A": 0.12269726395606995, - "B": 0.039833974093198776, - "C": 0.10827996581792831, - "D": 0.706074059009552 + "A": 0.3711197078227997, + "B": 0.014389872550964355, + "C": 0.04432392865419388, + "D": 0.5399760007858276 }, "sample": { "messages": [ @@ -34207,10 +34207,10 @@ ] }, "predict": { - "A": 0.8303940296173096, - "B": 0.01952899619936943, - "C": 0.0364849679172039, - "D": 0.0601535439491272 + "A": 0.8179712295532227, + "B": 0.0013090834254398942, + "C": 0.016976449638605118, + "D": 0.0014833856839686632 }, "sample": { "messages": [ @@ -34252,10 +34252,10 @@ ] }, "predict": { - "A": 0.07879466563463211, - "B": 0.05415473133325577, - "C": 0.8471225500106812, - "D": 0.0039229560643434525 + "A": 0.08912663161754608, + "B": 0.032787855714559555, + "C": 0.8456099629402161, + "D": 0.0015335084171965718 }, "sample": { "messages": [ @@ -34297,10 +34297,10 @@ ] }, "predict": { - "A": 0.0016698924591764808, - "B": 0.9802024960517883, - "C": 0.0003726033610291779, - "D": 0.0013005133951082826 + "A": 0.0007923246594145894, + "B": 0.9249280095100403, + "C": 0.00017679150914773345, + "D": 0.0010829793754965067 }, "sample": { "messages": [ @@ -34342,10 +34342,10 @@ ] }, "predict": { - "A": 0.08947894722223282, - "B": 0.03730037808418274, - "C": 0.8489526510238647, - "D": 0.005720197688788176 + "A": 0.015560446307063103, + "B": 0.0039342972449958324, + "C": 0.9626907706260681, + "D": 0.001199892838485539 }, "sample": { "messages": [ @@ -34380,17 +34380,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.2164597362279892, - "B": 0.5883985161781311, - "C": 0.022814685478806496, - "D": 0.14877043664455414 + "A": 0.4237119257450104, + "B": 0.29121267795562744, + "C": 0.023904193192720413, + "D": 0.22679665684700012 }, "sample": { "messages": [ @@ -34420,7 +34420,7 @@ "prompt_len": 101, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -34432,10 +34432,10 @@ ] }, "predict": { - "A": 0.007803103420883417, - "B": 0.003055739216506481, - "C": 0.06533458828926086, - "D": 0.9019160866737366 + "A": 0.004307765047997236, + "B": 0.00022830319358035922, + "C": 0.004046770744025707, + "D": 0.9302181601524353 }, "sample": { "messages": [ @@ -34477,10 +34477,10 @@ ] }, "predict": { - "A": 0.01928422600030899, - "B": 0.02185189165174961, - "C": 0.9291660785675049, - "D": 0.0170182716101408 + "A": 0.00840992946177721, + "B": 0.003972569480538368, + "C": 0.9720556735992432, + "D": 0.002126363106071949 }, "sample": { "messages": [ @@ -34522,10 +34522,10 @@ ] }, "predict": { - "A": 0.04148728400468826, - "B": 0.003005328821018338, - "C": 0.944246232509613, - "D": 0.001176903722807765 + "A": 0.036865267902612686, + "B": 0.0006752109038643539, + "C": 0.9507677555084229, + "D": 0.0003189470444340259 }, "sample": { "messages": [ @@ -34567,10 +34567,10 @@ ] }, "predict": { - "A": 0.0019117274787276983, - "B": 0.0007032852154225111, - "C": 0.9902993440628052, - "D": 6.96346687618643e-05 + "A": 0.0013047548709437251, + "B": 0.00014638944412581623, + "C": 0.9833992719650269, + "D": 1.5429333870997652e-05 }, "sample": { "messages": [ @@ -34612,10 +34612,10 @@ ] }, "predict": { - "A": 0.2149316370487213, - "B": 0.017642663791775703, - "C": 0.7501850724220276, - "D": 0.0012780303368344903 + "A": 0.14379847049713135, + "B": 0.002984442515298724, + "C": 0.8275031447410583, + "D": 0.00027759638032875955 }, "sample": { "messages": [ @@ -34657,10 +34657,10 @@ ] }, "predict": { - "A": 0.0011607015039771795, - "B": 0.0005482765845954418, - "C": 0.9913073182106018, - "D": 3.5050150472670794e-05 + "A": 0.0007451142882928252, + "B": 0.00016625746502541006, + "C": 0.9856312274932861, + "D": 1.546435487398412e-05 }, "sample": { "messages": [ @@ -34695,17 +34695,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "B" ] }, "predict": { - "A": 0.463753342628479, - "B": 0.463753342628479, - "C": 0.026163175702095032, - "D": 0.02308891899883747 + "A": 0.35381773114204407, + "B": 0.5833468437194824, + "C": 0.01210697740316391, + "D": 0.009428923018276691 }, "sample": { "messages": [ @@ -34735,7 +34735,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " B" } } { @@ -34747,10 +34747,10 @@ ] }, "predict": { - "A": 0.012210480868816376, - "B": 0.004491984844207764, - "C": 0.0005364910466596484, - "D": 0.9699985980987549 + "A": 0.012133312411606312, + "B": 0.0021084535401314497, + "C": 0.0001347889337921515, + "D": 0.9638683795928955 }, "sample": { "messages": [ @@ -34792,10 +34792,10 @@ ] }, "predict": { - "A": 0.970252513885498, - "B": 0.0010025595547631383, - "C": 0.0011360488133504987, - "D": 0.00022370128135662526 + "A": 0.8802269697189331, + "B": 7.465929229510948e-05, + "C": 0.00021603386267088354, + "D": 4.820366666535847e-05 }, "sample": { "messages": [ @@ -34837,10 +34837,10 @@ ] }, "predict": { - "A": 0.0017544942675158381, - "B": 0.8020577430725098, - "C": 0.004769209306687117, - "D": 0.1579345166683197 + "A": 0.0015439526177942753, + "B": 0.7513306140899658, + "C": 0.0073657869361341, + "D": 0.16764453053474426 }, "sample": { "messages": [ @@ -34882,10 +34882,10 @@ ] }, "predict": { - "A": 0.0016294424422085285, - "B": 0.006444569211453199, - "C": 0.009376793168485165, - "D": 0.9564588665962219 + "A": 0.0009115726570598781, + "B": 0.0019297994440421462, + "C": 0.010432397946715355, + "D": 0.9390944838523865 }, "sample": { "messages": [ @@ -34927,10 +34927,10 @@ ] }, "predict": { - "A": 0.41915518045425415, - "B": 0.006775012705475092, - "C": 0.5382059216499329, - "D": 0.0024923880118876696 + "A": 0.3101153075695038, + "B": 0.001844012993387878, + "C": 0.5793716311454773, + "D": 0.00041145490831695497 }, "sample": { "messages": [ @@ -34972,10 +34972,10 @@ ] }, "predict": { - "A": 0.9539715051651001, - "B": 0.012008728459477425, - "C": 0.0008699094178155065, - "D": 0.001265710685402155 + "A": 0.9298614859580994, + "B": 0.0013979901559650898, + "C": 0.0003534673014655709, + "D": 0.0005142918671481311 }, "sample": { "messages": [ @@ -35017,10 +35017,10 @@ ] }, "predict": { - "A": 0.007153588347136974, - "B": 0.9369352459907532, - "C": 0.0014086251612752676, - "D": 0.0023224304895848036 + "A": 0.010285443626344204, + "B": 0.8170740008354187, + "C": 0.0008987320470623672, + "D": 0.002294992795214057 }, "sample": { "messages": [ @@ -35062,10 +35062,10 @@ ] }, "predict": { - "A": 0.00450655072927475, - "B": 0.9731439352035522, - "C": 0.00212874379940331, - "D": 0.0005382306990213692 + "A": 0.008990894071757793, + "B": 0.9170960783958435, + "C": 0.0020061396062374115, + "D": 0.0005747685790993273 }, "sample": { "messages": [ @@ -35107,10 +35107,10 @@ ] }, "predict": { - "A": 0.05698473006486893, - "B": 0.8913912177085876, - "C": 0.023754769936203957, - "D": 0.003642912255600095 + "A": 0.12250637263059616, + "B": 0.7988418340682983, + "C": 0.006099232472479343, + "D": 0.0018601608462631702 }, "sample": { "messages": [ @@ -35152,10 +35152,10 @@ ] }, "predict": { - "A": 0.017274310812354088, - "B": 0.025133974850177765, - "C": 0.9431454539299011, - "D": 0.004949172958731651 + "A": 0.010824920609593391, + "B": 0.0035143368877470493, + "C": 0.9744282364845276, + "D": 0.0010068743722513318 }, "sample": { "messages": [ @@ -35197,10 +35197,10 @@ ] }, "predict": { - "A": 0.21955718100070953, - "B": 0.06290418654680252, - "C": 0.6762837171554565, - "D": 0.009646669030189514 + "A": 0.11484506726264954, + "B": 0.004453026689589024, + "C": 0.8485966920852661, + "D": 0.001445686211809516 }, "sample": { "messages": [ @@ -35242,10 +35242,10 @@ ] }, "predict": { - "A": 0.039887744933366776, - "B": 0.011428031139075756, - "C": 0.007854362949728966, - "D": 0.9078409075737 + "A": 0.02230129949748516, + "B": 0.0017196914413943887, + "C": 0.006002312991768122, + "D": 0.9482753872871399 }, "sample": { "messages": [ @@ -35287,10 +35287,10 @@ ] }, "predict": { - "A": 0.9174618721008301, - "B": 0.03139381855726242, - "C": 0.006181809585541487, - "D": 0.006181809585541487 + "A": 0.8963202834129333, + "B": 0.0036630562972277403, + "C": 0.0032326357904821634, + "D": 0.0008173383539542556 }, "sample": { "messages": [ @@ -35332,10 +35332,10 @@ ] }, "predict": { - "A": 0.001481204992160201, - "B": 0.9852111339569092, - "C": 0.0002004593115998432, - "D": 0.001018016366288066 + "A": 0.0008670924580655992, + "B": 0.9508823752403259, + "C": 0.00015067806816659868, + "D": 0.0011133687803521752 }, "sample": { "messages": [ @@ -35377,10 +35377,10 @@ ] }, "predict": { - "A": 0.09919380396604538, - "B": 0.3462204337120056, - "C": 0.5037477612495422, - "D": 0.001414923812262714 + "A": 0.19632169604301453, + "B": 0.04380529001355171, + "C": 0.685230016708374, + "D": 0.0007537116180174053 }, "sample": { "messages": [ @@ -35422,10 +35422,10 @@ ] }, "predict": { - "A": 0.053631413727998734, - "B": 0.838936448097229, - "C": 0.007258222438395023, - "D": 0.06886409968137741 + "A": 0.08117076009511948, + "B": 0.7701267600059509, + "C": 0.005879989825189114, + "D": 0.10422532260417938 }, "sample": { "messages": [ @@ -35467,10 +35467,10 @@ ] }, "predict": { - "A": 0.8859862089157104, - "B": 0.07272617518901825, - "C": 0.0017103558639064431, - "D": 0.00281990016810596 + "A": 0.897068977355957, + "B": 0.005334167275577784, + "C": 0.0004378551384434104, + "D": 0.0011181022273376584 }, "sample": { "messages": [ @@ -35512,10 +35512,10 @@ ] }, "predict": { - "A": 0.08381202071905136, - "B": 0.9010641574859619, - "C": 0.0019710699561983347, - "D": 0.004172754939645529 + "A": 0.05163862928748131, + "B": 0.9153165221214294, + "C": 0.00226884288713336, + "D": 0.0033011469058692455 }, "sample": { "messages": [ @@ -35557,10 +35557,10 @@ ] }, "predict": { - "A": 0.0012700462248176336, - "B": 0.9572392106056213, - "C": 0.008281742222607136, - "D": 0.013654284179210663 + "A": 0.0003339886025059968, + "B": 0.9352853298187256, + "C": 0.0014061445835977793, + "D": 0.004331230651587248 }, "sample": { "messages": [ @@ -35602,10 +35602,10 @@ ] }, "predict": { - "A": 0.1747468262910843, - "B": 0.22437934577465057, - "C": 0.47501105070114136, - "D": 0.10598929971456528 + "A": 0.022540545091032982, + "B": 0.003916959278285503, + "C": 0.9584482908248901, + "D": 0.0001518769859103486 }, "sample": { "messages": [ @@ -35640,17 +35640,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "C" ] }, "predict": { - "A": 0.0032428449485450983, - "B": 0.011318640783429146, - "C": 0.25761109590530396, - "D": 0.7002595067024231 + "A": 0.0009627596591599286, + "B": 0.0014911512844264507, + "C": 0.4987218379974365, + "D": 0.4401204586029053 }, "sample": { "messages": [ @@ -35680,7 +35680,7 @@ "prompt_len": 105, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -35692,10 +35692,10 @@ ] }, "predict": { - "A": 0.9429792165756226, - "B": 0.003001296194270253, - "C": 0.009244643151760101, - "D": 0.0012511262902989984 + "A": 0.8715589642524719, + "B": 0.00042540382128208876, + "C": 0.008544464595615864, + "D": 0.0009586622472852468 }, "sample": { "messages": [ @@ -35737,10 +35737,10 @@ ] }, "predict": { - "A": 0.20601102709770203, - "B": 0.7190490961074829, - "C": 0.02171340212225914, - "D": 0.016910415142774582 + "A": 0.24194663763046265, + "B": 0.6576790809631348, + "C": 0.013649693690240383, + "D": 0.0327439121901989 }, "sample": { "messages": [ @@ -35782,10 +35782,10 @@ ] }, "predict": { - "A": 0.738791286945343, - "B": 0.06064368411898613, - "C": 0.16484662890434265, - "D": 0.000594529730733484 + "A": 0.7903740406036377, + "B": 0.018587816506624222, + "C": 0.09439671784639359, + "D": 0.00012524370686151087 }, "sample": { "messages": [ @@ -35820,17 +35820,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.23126593232154846, - "B": 0.3364899456501007, - "C": 0.3364899456501007, - "D": 0.06625879555940628 + "A": 0.3377424478530884, + "B": 0.08539465069770813, + "C": 0.49141234159469604, + "D": 0.013095694594085217 }, "sample": { "messages": [ @@ -35872,10 +35872,10 @@ ] }, "predict": { - "A": 0.02262682467699051, - "B": 0.004455486312508583, - "C": 0.9621170163154602, - "D": 0.002104622544720769 + "A": 0.008500885218381882, + "B": 0.0006555179134011269, + "C": 0.9825687408447266, + "D": 0.00039759170613251626 }, "sample": { "messages": [ @@ -35917,10 +35917,10 @@ ] }, "predict": { - "A": 0.007422024384140968, - "B": 0.0035059163346886635, - "C": 0.0010044617811217904, - "D": 0.972093403339386 + "A": 0.007361264433711767, + "B": 0.000879177765455097, + "C": 0.000879177765455097, + "D": 0.964135468006134 }, "sample": { "messages": [ @@ -35962,10 +35962,10 @@ ] }, "predict": { - "A": 0.11915204674005508, - "B": 0.03868301212787628, - "C": 0.6856727600097656, - "D": 0.13501696288585663 + "A": 0.0908549427986145, + "B": 0.005125684663653374, + "C": 0.8620076775550842, + "D": 0.01393305603414774 }, "sample": { "messages": [ @@ -36007,10 +36007,10 @@ ] }, "predict": { - "A": 0.724594235420227, - "B": 0.03183645009994507, - "C": 0.2075997292995453, - "D": 0.0023062247782945633 + "A": 0.6367613077163696, + "B": 0.0062425886280834675, + "C": 0.2654415965080261, + "D": 0.0007936560432426631 }, "sample": { "messages": [ @@ -36052,10 +36052,10 @@ ] }, "predict": { - "A": 0.005464076995849609, - "B": 0.9189165234565735, - "C": 0.0025810475926846266, - "D": 0.058744367212057114 + "A": 0.0013678418472409248, + "B": 0.9098085761070251, + "C": 0.001133980811573565, + "D": 0.06590631604194641 }, "sample": { "messages": [ @@ -36097,10 +36097,10 @@ ] }, "predict": { - "A": 0.9585336446762085, - "B": 0.004438891541212797, - "C": 0.0005301499622873962, - "D": 0.000680726021528244 + "A": 0.9135648012161255, + "B": 0.0005378660862334073, + "C": 0.00021063137683086097, + "D": 0.0001447646936867386 }, "sample": { "messages": [ @@ -36142,10 +36142,10 @@ ] }, "predict": { - "A": 0.01381869986653328, - "B": 0.9687656760215759, - "C": 0.0007795977289788425, - "D": 0.0006071513053029776 + "A": 0.012692718766629696, + "B": 0.8898283243179321, + "C": 0.000978757394477725, + "D": 0.0009194574668072164 }, "sample": { "messages": [ @@ -36187,10 +36187,10 @@ ] }, "predict": { - "A": 0.00575048103928566, - "B": 0.009480941109359264, - "C": 0.0011323369108140469, - "D": 0.9670822620391846 + "A": 0.0025567857082933187, + "B": 0.0009405888267792761, + "C": 0.00018521292076911777, + "D": 0.9689866900444031 }, "sample": { "messages": [ @@ -36232,10 +36232,10 @@ ] }, "predict": { - "A": 0.00020187294285278767, - "B": 0.00042736504110507667, - "C": 0.9921588897705078, - "D": 7.42649135645479e-05 + "A": 5.385503391153179e-05, + "B": 7.36111105652526e-05, + "C": 0.9834242463111877, + "D": 2.5439318051212467e-05 }, "sample": { "messages": [ @@ -36277,10 +36277,10 @@ ] }, "predict": { - "A": 0.0906209722161293, - "B": 0.03777644410729408, - "C": 0.0016597810899838805, - "D": 0.8597878813743591 + "A": 0.022207103669643402, + "B": 0.011886605992913246, + "C": 0.0004906188114546239, + "D": 0.9442700147628784 }, "sample": { "messages": [ @@ -36322,10 +36322,10 @@ ] }, "predict": { - "A": 0.005809110589325428, - "B": 0.9769421815872192, - "C": 0.0012961877509951591, - "D": 0.0005403314717113972 + "A": 0.017197344452142715, + "B": 0.9389431476593018, + "C": 0.0023274074774235487, + "D": 0.0005884607089683414 }, "sample": { "messages": [ @@ -36367,10 +36367,10 @@ ] }, "predict": { - "A": 0.007480768021196127, - "B": 0.005141451954841614, - "C": 0.9797873497009277, - "D": 0.0021432761568576097 + "A": 0.001163796172477305, + "B": 0.0002764258242677897, + "C": 0.993950366973877, + "D": 0.00020223745377734303 }, "sample": { "messages": [ @@ -36405,17 +36405,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "D" + "C" ] }, "predict": { - "A": 0.16423839330673218, - "B": 0.022227246314287186, - "C": 0.3476926386356354, - "D": 0.4464462101459503 + "A": 0.10553049296140671, + "B": 0.009221152402460575, + "C": 0.6072860956192017, + "D": 0.2531545162200928 }, "sample": { "messages": [ @@ -36445,7 +36445,7 @@ "prompt_len": 98, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -36457,10 +36457,10 @@ ] }, "predict": { - "A": 0.9464205503463745, - "B": 0.010513783432543278, - "C": 0.000977934105321765, - "D": 0.0020702865440398455 + "A": 0.9281277656555176, + "B": 0.002030271338298917, + "C": 0.0005816824268549681, + "D": 0.001086725853383541 }, "sample": { "messages": [ @@ -36502,10 +36502,10 @@ ] }, "predict": { - "A": 0.9428779482841492, - "B": 0.0038533268962055445, - "C": 0.015240202657878399, - "D": 0.00016930335550568998 + "A": 0.8784376382827759, + "B": 0.0005171847878955305, + "C": 0.0016957836924120784, + "D": 2.740979107329622e-05 }, "sample": { "messages": [ @@ -36547,10 +36547,10 @@ ] }, "predict": { - "A": 0.009427043609321117, - "B": 0.019957050681114197, - "C": 0.9615845680236816, - "D": 0.000322575680911541 + "A": 0.008328568190336227, + "B": 0.015559814870357513, + "C": 0.9626516103744507, + "D": 0.0003229336580261588 }, "sample": { "messages": [ @@ -36592,10 +36592,10 @@ ] }, "predict": { - "A": 0.17963221669197083, - "B": 0.7104592323303223, - "C": 0.010134155862033367, - "D": 0.08485225588083267 + "A": 0.40456917881965637, + "B": 0.519477128982544, + "C": 0.006539252121001482, + "D": 0.025863241404294968 }, "sample": { "messages": [ @@ -36637,10 +36637,10 @@ ] }, "predict": { - "A": 0.00925533752888441, - "B": 0.011884087696671486, - "C": 0.004954025149345398, - "D": 0.944070041179657 + "A": 0.01023587305098772, + "B": 0.005146919749677181, + "C": 0.001778727862983942, + "D": 0.9214038252830505 }, "sample": { "messages": [ @@ -36682,10 +36682,10 @@ ] }, "predict": { - "A": 0.0024289207067340612, - "B": 0.00275232782587409, - "C": 0.9798965454101562, - "D": 0.0001552757021272555 + "A": 0.0012239457573741674, + "B": 0.0006973831914365292, + "C": 0.9819886684417725, + "D": 7.350365922320634e-05 }, "sample": { "messages": [ @@ -36727,10 +36727,10 @@ ] }, "predict": { - "A": 0.004522266797721386, - "B": 0.007455956656485796, - "C": 0.0006120221805758774, - "D": 0.9765376448631287 + "A": 0.0027215539012104273, + "B": 0.001065775752067566, + "C": 0.00036832227488048375, + "D": 0.968940258026123 }, "sample": { "messages": [ @@ -36772,10 +36772,10 @@ ] }, "predict": { - "A": 0.02194111794233322, - "B": 0.02817295305430889, - "C": 0.9329600930213928, - "D": 0.0015894094249233603 + "A": 0.05655223876237869, + "B": 0.020804407075047493, + "C": 0.8846258521080017, + "D": 0.002815570216625929 }, "sample": { "messages": [ @@ -36817,10 +36817,10 @@ ] }, "predict": { - "A": 0.9729171991348267, - "B": 0.002128247870132327, - "C": 0.0006097531295381486, - "D": 0.0008871856844052672 + "A": 0.9388121962547302, + "B": 0.00035686971386894584, + "C": 0.00020333821885287762, + "D": 0.00035686971386894584 }, "sample": { "messages": [ @@ -36862,10 +36862,10 @@ ] }, "predict": { - "A": 0.020054679363965988, - "B": 0.0016461884370073676, - "C": 0.9662885665893555, - "D": 0.003484980668872595 + "A": 0.006642370484769344, + "B": 0.00031066781957633793, + "C": 0.985815167427063, + "D": 0.00042463254067115486 }, "sample": { "messages": [ @@ -36907,10 +36907,10 @@ ] }, "predict": { - "A": 0.0014623210299760103, - "B": 0.0051040020771324635, - "C": 0.000609586073551327, - "D": 0.9726506471633911 + "A": 0.0010029206750914454, + "B": 0.0012877758126705885, + "C": 0.00015380287368316203, + "D": 0.9117962121963501 }, "sample": { "messages": [ @@ -36952,10 +36952,10 @@ ] }, "predict": { - "A": 0.018679361790418625, - "B": 0.06519738584756851, - "C": 0.0004977881326340139, - "D": 0.9000220894813538 + "A": 0.013391519896686077, + "B": 0.01719505339860916, + "C": 0.0001685743627604097, + "D": 0.938818097114563 }, "sample": { "messages": [ @@ -36997,10 +36997,10 @@ ] }, "predict": { - "A": 0.00032921231468208134, - "B": 8.773211447987705e-06, - "C": 4.144171725783963e-06, - "D": 0.9813681244850159 + "A": 0.0002800891816150397, + "B": 5.8130649449594785e-06, + "C": 2.7458975182526046e-06, + "D": 0.9461041688919067 }, "sample": { "messages": [ @@ -37042,10 +37042,10 @@ ] }, "predict": { - "A": 0.002387109911069274, - "B": 0.9630288481712341, - "C": 0.015565911307930946, - "D": 0.0018590829567983747 + "A": 0.001352103310637176, + "B": 0.8993402719497681, + "C": 0.023966606706380844, + "D": 0.002229241654276848 }, "sample": { "messages": [ @@ -37087,10 +37087,10 @@ ] }, "predict": { - "A": 0.9098391532897949, - "B": 0.0659085288643837, - "C": 0.0008296659216284752, - "D": 0.004213391803205013 + "A": 0.9481024146080017, + "B": 0.008202692493796349, + "C": 0.00018122108303941786, + "D": 0.0006325237918645144 }, "sample": { "messages": [ @@ -37132,10 +37132,10 @@ ] }, "predict": { - "A": 0.9469719529151917, - "B": 0.0014237146824598312, - "C": 0.00046221254160627723, - "D": 0.010519908741116524 + "A": 0.8646339178085327, + "B": 0.00016526684339623898, + "C": 0.00014584748714696616, + "D": 0.0016691361088305712 }, "sample": { "messages": [ @@ -37173,14 +37173,14 @@ "acc": false, "f1_macro": [ "D", - "A" + "C" ] }, "predict": { - "A": 0.3900478184223175, - "B": 0.344215989112854, - "C": 0.11175057291984558, - "D": 0.1266299933195114 + "A": 0.3593364655971527, + "B": 0.0551060289144516, + "C": 0.5228314399719238, + "D": 0.0039918674156069756 }, "sample": { "messages": [ @@ -37210,7 +37210,7 @@ "prompt_len": 74, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -37222,10 +37222,10 @@ ] }, "predict": { - "A": 0.9526143670082092, - "B": 0.005664451979100704, - "C": 0.0002488786412868649, - "D": 0.0006765223224647343 + "A": 0.8922367095947266, + "B": 0.00040911108953878284, + "C": 0.00012477184645831585, + "D": 9.128502279054374e-05 }, "sample": { "messages": [ @@ -37267,10 +37267,10 @@ ] }, "predict": { - "A": 0.39097002148628235, - "B": 0.5688579678535461, - "C": 0.007160865236073732, - "D": 0.007160865236073732 + "A": 0.050705958157777786, + "B": 0.8987845778465271, + "C": 0.002687317319214344, + "D": 0.005344368051737547 }, "sample": { "messages": [ @@ -37312,10 +37312,10 @@ ] }, "predict": { - "A": 0.0004262834554538131, - "B": 0.0004262834554538131, - "C": 0.9896479249000549, - "D": 0.00013839398161508143 + "A": 0.00035091073368676007, + "B": 8.872414036886767e-05, + "C": 0.9826732873916626, + "D": 2.7059333660872653e-05 }, "sample": { "messages": [ @@ -37357,10 +37357,10 @@ ] }, "predict": { - "A": 0.03629103675484657, - "B": 0.0033755924087017775, - "C": 0.0007531964802183211, - "D": 0.935958206653595 + "A": 0.019560789689421654, + "B": 0.0004321541346143931, + "C": 0.0004321541346143931, + "D": 0.9424915909767151 }, "sample": { "messages": [ @@ -37402,10 +37402,10 @@ ] }, "predict": { - "A": 0.3295559585094452, - "B": 0.07353387027978897, - "C": 0.5433459281921387, - "D": 0.0036610360257327557 + "A": 0.21142442524433136, + "B": 0.005634269677102566, + "C": 0.7379437685012817, + "D": 0.00040814513340592384 }, "sample": { "messages": [ @@ -37440,17 +37440,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "B" ] }, "predict": { - "A": 0.7911532521247864, - "B": 0.1765301525592804, - "C": 0.002518067369237542, - "D": 0.00533074839040637 + "A": 0.3934207558631897, + "B": 0.505162239074707, + "C": 0.005611845757812262, + "D": 0.009252369403839111 }, "sample": { "messages": [ @@ -37480,7 +37480,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " B" } } { @@ -37492,10 +37492,10 @@ ] }, "predict": { - "A": 0.04057803377509117, - "B": 0.8150316476821899, - "C": 0.02788884937763214, - "D": 0.052103228867053986 + "A": 0.21806271374225616, + "B": 0.6716804504394531, + "C": 0.003110497957095504, + "D": 0.04865637049078941 }, "sample": { "messages": [ @@ -37537,10 +37537,10 @@ ] }, "predict": { - "A": 0.0007561867823824286, - "B": 0.04678361862897873, - "C": 6.607487011933699e-05, - "D": 0.9396741390228271 + "A": 0.0002875582140404731, + "B": 0.04542987793684006, + "C": 3.030841071449686e-05, + "D": 0.9124835133552551 }, "sample": { "messages": [ @@ -37582,10 +37582,10 @@ ] }, "predict": { - "A": 0.00025681298575364053, - "B": 0.9829840660095215, - "C": 0.001015715068206191, - "D": 0.00215026899240911 + "A": 0.00013937479525338858, + "B": 0.9362770318984985, + "C": 0.001807439955882728, + "D": 0.0008537741960026324 }, "sample": { "messages": [ @@ -37627,10 +37627,10 @@ ] }, "predict": { - "A": 0.00037770860944874585, - "B": 0.0002941597776953131, - "C": 0.9936328530311584, - "D": 0.0007056525791995227 + "A": 4.7591656766599044e-05, + "B": 4.7591656766599044e-05, + "C": 0.9847641587257385, + "D": 9.464730828767642e-05 }, "sample": { "messages": [ @@ -37672,10 +37672,10 @@ ] }, "predict": { - "A": 0.9579080939292908, - "B": 0.010641397908329964, - "C": 0.001121595036238432, - "D": 0.002690566936507821 + "A": 0.8757398724555969, + "B": 0.00357894878834486, + "C": 0.0005488495226018131, + "D": 0.0007985713891685009 }, "sample": { "messages": [ @@ -37717,10 +37717,10 @@ ] }, "predict": { - "A": 0.05232422053813934, - "B": 0.20694629848003387, - "C": 0.7223135828971863, - "D": 0.0037903536576777697 + "A": 0.023075465112924576, + "B": 0.17050591111183167, + "C": 0.7641544342041016, + "D": 0.0016715811798349023 }, "sample": { "messages": [ @@ -37762,10 +37762,10 @@ ] }, "predict": { - "A": 0.014614351093769073, - "B": 0.0030633346177637577, - "C": 0.051009099930524826, - "D": 0.9041579365730286 + "A": 0.0024314606562256813, + "B": 0.00047878295299597085, + "C": 0.027826592326164246, + "D": 0.9214901924133301 }, "sample": { "messages": [ @@ -37807,10 +37807,10 @@ ] }, "predict": { - "A": 0.003965637646615505, - "B": 0.9703595638275146, - "C": 0.00041797515586949885, - "D": 0.0011361741926521063 + "A": 0.0033611683174967766, + "B": 0.9319588541984558, + "C": 0.00024348219449166209, + "D": 0.0017991038039326668 }, "sample": { "messages": [ @@ -37852,10 +37852,10 @@ ] }, "predict": { - "A": 0.010771816596388817, - "B": 0.0030861771665513515, - "C": 0.969648003578186, - "D": 0.005765737500041723 + "A": 0.0016737614059820771, + "B": 0.0003508394001983106, + "C": 0.9824734926223755, + "D": 0.0008958998369053006 }, "sample": { "messages": [ @@ -37897,10 +37897,10 @@ ] }, "predict": { - "A": 0.002744368277490139, - "B": 0.0001867565733846277, - "C": 0.0003953636914957315, - "D": 0.9770627021789551 + "A": 0.0023520069662481546, + "B": 5.531389615498483e-05, + "C": 0.0003183094959240407, + "D": 0.9488672614097595 }, "sample": { "messages": [ @@ -37942,10 +37942,10 @@ ] }, "predict": { - "A": 0.943274736404419, - "B": 0.0014181560836732388, - "C": 9.065969061339274e-05, - "D": 0.00011640934098977596 + "A": 0.9238899946212769, + "B": 6.496498099295422e-05, + "C": 1.981319837796036e-05, + "D": 9.359091563965194e-06 }, "sample": { "messages": [ @@ -37987,10 +37987,10 @@ ] }, "predict": { - "A": 0.9185711145401001, - "B": 0.01906433515250683, - "C": 0.0037539901677519083, - "D": 0.009005354717373848 + "A": 0.900800883769989, + "B": 0.001970493933185935, + "C": 0.0007716564578004181, + "D": 0.0008214240660890937 }, "sample": { "messages": [ @@ -38032,10 +38032,10 @@ ] }, "predict": { - "A": 0.968841552734375, - "B": 0.005760942120105028, - "C": 0.0010011016856878996, - "D": 0.0006880464497953653 + "A": 0.9037460684776306, + "B": 0.0004998478107154369, + "C": 0.00016227681771852076, + "D": 8.159791468642652e-05 }, "sample": { "messages": [ @@ -38077,10 +38077,10 @@ ] }, "predict": { - "A": 0.6445060968399048, - "B": 0.019462399184703827, - "C": 0.0004039291525259614, - "D": 0.3044431507587433 + "A": 0.666878342628479, + "B": 0.0029011513106524944, + "C": 7.731306686764583e-05, + "D": 0.2779962420463562 }, "sample": { "messages": [ @@ -38122,10 +38122,10 @@ ] }, "predict": { - "A": 0.004022618755698204, - "B": 0.0011524994624778628, - "C": 0.0002913975331466645, - "D": 0.984302282333374 + "A": 0.004518849775195122, + "B": 0.0003273443435318768, + "C": 0.0003273443435318768, + "D": 0.9757997393608093 }, "sample": { "messages": [ @@ -38167,10 +38167,10 @@ ] }, "predict": { - "A": 0.0007962100789882243, - "B": 0.0001383605704177171, - "C": 0.00015678304771427065, - "D": 0.9894089698791504 + "A": 0.0005633776308968663, + "B": 1.5981793694663793e-05, + "C": 4.922739026369527e-05, + "D": 0.9568961262702942 }, "sample": { "messages": [ @@ -38212,10 +38212,10 @@ ] }, "predict": { - "A": 0.8268645405769348, - "B": 0.02203519456088543, - "C": 0.0010970677249133587, - "D": 0.11190395057201385 + "A": 0.8525861501693726, + "B": 0.0015461597358807921, + "C": 0.00017347431275993586, + "D": 0.07930286228656769 }, "sample": { "messages": [ @@ -38257,10 +38257,10 @@ ] }, "predict": { - "A": 0.0031162903178483248, - "B": 0.9791092872619629, - "C": 7.3288130806759e-05, - "D": 0.0008928321185521781 + "A": 0.003849984845146537, + "B": 0.9420601725578308, + "C": 9.638247865950689e-05, + "D": 0.001036209287121892 }, "sample": { "messages": [ @@ -38302,10 +38302,10 @@ ] }, "predict": { - "A": 0.0017586868489161134, - "B": 0.07478126883506775, - "C": 0.9110223054885864, - "D": 0.00039241608465090394 + "A": 0.0006676170160062611, + "B": 0.041304003447294235, + "C": 0.9400748014450073, + "D": 0.0002614425902720541 }, "sample": { "messages": [ @@ -38347,10 +38347,10 @@ ] }, "predict": { - "A": 0.9577797651290894, - "B": 0.0198780857026577, - "C": 0.0012707634596154094, - "D": 0.0016316927503794432 + "A": 0.9347152709960938, + "B": 0.0033711097203195095, + "C": 0.0006235925829969347, + "D": 0.0009658390772528946 }, "sample": { "messages": [ @@ -38392,10 +38392,10 @@ ] }, "predict": { - "A": 0.002382709411904216, - "B": 0.0044514876790344715, - "C": 0.015537216328084469, - "D": 0.9612535834312439 + "A": 0.0003117227752227336, + "B": 0.0005823747487738729, + "C": 0.028060389682650566, + "D": 0.9292324781417847 }, "sample": { "messages": [ @@ -38430,17 +38430,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.26063039898872375, - "B": 0.10864691436290741, - "C": 0.1791284829378128, - "D": 0.3792150020599365 + "A": 0.4637312889099121, + "B": 0.0553848072886467, + "C": 0.10347244143486023, + "D": 0.2812672555446625 }, "sample": { "messages": [ @@ -38470,7 +38470,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -38482,10 +38482,10 @@ ] }, "predict": { - "A": 0.9751371741294861, - "B": 0.002417123643681407, - "C": 0.0005393331521190703, - "D": 0.000784725125413388 + "A": 0.9314718246459961, + "B": 0.00031247397419065237, + "C": 0.00016725526074878871, + "D": 0.00015712178719695657 }, "sample": { "messages": [ @@ -38527,10 +38527,10 @@ ] }, "predict": { - "A": 0.33764874935150146, - "B": 0.6308109164237976, - "C": 0.008998033590614796, - "D": 0.0029212343506515026 + "A": 0.2599593698978424, + "B": 0.706642746925354, + "C": 0.0025485516525804996, + "D": 0.0009375597001053393 }, "sample": { "messages": [ @@ -38572,10 +38572,10 @@ ] }, "predict": { - "A": 0.06593065708875656, - "B": 0.0008299444452859461, - "C": 0.003719553118571639, - "D": 0.9101446270942688 + "A": 0.04053064063191414, + "B": 0.00019979932403657585, + "C": 0.0016728992341086268, + "D": 0.922473132610321 }, "sample": { "messages": [ @@ -38617,10 +38617,10 @@ ] }, "predict": { - "A": 0.02222958207130432, - "B": 0.9452258348464966, - "C": 0.01527815219014883, - "D": 0.0014210895169526339 + "A": 0.03151676803827286, + "B": 0.921055018901825, + "C": 0.003536083735525608, + "D": 0.016869710758328438 }, "sample": { "messages": [ @@ -38662,10 +38662,10 @@ ] }, "predict": { - "A": 0.019864121451973915, - "B": 0.8446438908576965, - "C": 0.0026883166283369064, - "D": 0.0611858032643795 + "A": 0.016969485208392143, + "B": 0.8176356554031372, + "C": 0.0022965704556554556, + "D": 0.059229329228401184 }, "sample": { "messages": [ @@ -38707,10 +38707,10 @@ ] }, "predict": { - "A": 0.00453433720394969, - "B": 0.9791441559791565, - "C": 0.00042175903217867017, - "D": 0.00453433720394969 + "A": 0.0006780342082493007, + "B": 0.9547432661056519, + "C": 0.0001512898743385449, + "D": 0.001117889303714037 }, "sample": { "messages": [ @@ -38752,10 +38752,10 @@ ] }, "predict": { - "A": 0.004517339635640383, - "B": 0.9754736423492432, - "C": 0.0012942393077537417, - "D": 0.0006927563808858395 + "A": 0.011469228193163872, + "B": 0.9111136794090271, + "C": 0.004781085532158613, + "D": 0.0011356074828654528 }, "sample": { "messages": [ @@ -38797,10 +38797,10 @@ ] }, "predict": { - "A": 0.949327290058136, - "B": 0.009306877851486206, - "C": 0.008213290013372898, - "D": 0.002076644916087389 + "A": 0.8976761698722839, + "B": 0.002225116826593876, + "C": 0.0041570658795535564, + "D": 0.0005625975900329649 }, "sample": { "messages": [ @@ -38842,10 +38842,10 @@ ] }, "predict": { - "A": 0.43744295835494995, - "B": 0.01921990141272545, - "C": 0.014968475326895714, - "D": 0.4956878125667572 + "A": 0.2787870764732361, + "B": 0.0024119808804243803, + "C": 0.005106163211166859, + "D": 0.6687754392623901 }, "sample": { "messages": [ @@ -38887,10 +38887,10 @@ ] }, "predict": { - "A": 0.009888313710689545, - "B": 0.08279383182525635, - "C": 0.002833049278706312, - "D": 0.8901175856590271 + "A": 0.008060787804424763, + "B": 0.024828974157571793, + "C": 0.0016896327724680305, + "D": 0.9317003488540649 }, "sample": { "messages": [ @@ -38932,10 +38932,10 @@ ] }, "predict": { - "A": 0.32126736640930176, - "B": 0.2208036333322525, - "C": 0.2208036333322525, - "D": 0.17196203768253326 + "A": 0.6595607995986938, + "B": 0.07877330482006073, + "C": 0.06951719522476196, + "D": 0.08926184475421906 }, "sample": { "messages": [ @@ -38977,10 +38977,10 @@ ] }, "predict": { - "A": 0.3267640471458435, - "B": 0.023670706897974014, - "C": 0.007684753742069006, - "D": 0.6104755997657776 + "A": 0.41409575939178467, + "B": 0.00914856605231762, + "C": 0.005906758829951286, + "D": 0.5317094922065735 }, "sample": { "messages": [ @@ -39022,10 +39022,10 @@ ] }, "predict": { - "A": 0.30558326840400696, - "B": 0.1635669469833374, - "C": 0.04135619476437569, - "D": 0.4446210265159607 + "A": 0.11162232607603073, + "B": 0.01333138532936573, + "C": 0.012523678131401539, + "D": 0.8247836232185364 }, "sample": { "messages": [ @@ -39067,10 +39067,10 @@ ] }, "predict": { - "A": 0.66204434633255, - "B": 0.2149343341588974, - "C": 0.009443555027246475, - "D": 0.061579715460538864 + "A": 0.6207627654075623, + "B": 0.13851089775562286, + "C": 0.007814249955117702, + "D": 0.030905956402420998 }, "sample": { "messages": [ @@ -39112,10 +39112,10 @@ ] }, "predict": { - "A": 0.93446946144104, - "B": 0.021976616233587265, - "C": 0.0007519984501414001, - "D": 0.002974211471155286 + "A": 0.9140793085098267, + "B": 0.006979080848395824, + "C": 0.0008335324237123132, + "D": 0.0019995402544736862 }, "sample": { "messages": [ @@ -39157,10 +39157,10 @@ ] }, "predict": { - "A": 0.0019091179128736258, - "B": 0.0010218771640211344, - "C": 0.9889475107192993, - "D": 0.0001299171126447618 + "A": 0.0003100666799582541, + "B": 9.456499537918717e-05, + "C": 0.9839076995849609, + "D": 2.1100302546983585e-05 }, "sample": { "messages": [ @@ -39202,10 +39202,10 @@ ] }, "predict": { - "A": 0.003959168214350939, - "B": 0.968776524066925, - "C": 0.0008834098698571324, - "D": 0.000688000291120261 + "A": 0.004802305717021227, + "B": 0.9151574969291687, + "C": 0.0006918377475813031, + "D": 0.0008345156093128026 }, "sample": { "messages": [ @@ -39247,10 +39247,10 @@ ] }, "predict": { - "A": 0.7256300449371338, - "B": 0.02191212959587574, - "C": 0.20789650082588196, - "D": 0.002309521660208702 + "A": 0.7356387376785278, + "B": 0.0030063875019550323, + "C": 0.14485585689544678, + "D": 0.00040687029832042754 }, "sample": { "messages": [ @@ -39292,10 +39292,10 @@ ] }, "predict": { - "A": 9.479364234721288e-05, - "B": 0.9862866997718811, - "C": 0.0001562882971484214, - "D": 0.00029198499396443367 + "A": 4.4334497943054885e-05, + "B": 0.917367160320282, + "C": 0.00012828611943405122, + "D": 0.00027158172451891005 }, "sample": { "messages": [ @@ -39337,10 +39337,10 @@ ] }, "predict": { - "A": 0.00037550562410615385, - "B": 0.0006191040738485754, - "C": 0.00048215879360213876, - "D": 0.9878374338150024 + "A": 0.00038878462510183454, + "B": 0.0004992093308828771, + "C": 0.0005656782886944711, + "D": 0.9608038067817688 }, "sample": { "messages": [ @@ -39382,10 +39382,10 @@ ] }, "predict": { - "A": 0.1167772114276886, - "B": 0.0065881190821528435, - "C": 0.862873375415802, - "D": 0.0004772417596541345 + "A": 0.10329537093639374, + "B": 0.0013002967461943626, + "C": 0.8648815155029297, + "D": 0.0002405307168373838 }, "sample": { "messages": [ @@ -39427,10 +39427,10 @@ ] }, "predict": { - "A": 0.8811125159263611, - "B": 0.056327637284994125, - "C": 0.007623116951435804, - "D": 0.008638123981654644 + "A": 0.8461813926696777, + "B": 0.009400226175785065, + "C": 0.003918597474694252, + "D": 0.007320903241634369 }, "sample": { "messages": [ @@ -39472,10 +39472,10 @@ ] }, "predict": { - "A": 0.06282874196767807, - "B": 0.02967819571495056, - "C": 0.8673240542411804, - "D": 0.012371712364256382 + "A": 0.14101389050483704, + "B": 0.03146445378661156, + "C": 0.7161276936531067, + "D": 0.011575126089155674 }, "sample": { "messages": [ @@ -39517,10 +39517,10 @@ ] }, "predict": { - "A": 0.9386466145515442, - "B": 0.019480988383293152, - "C": 0.009202168323099613, - "D": 0.0023266724310815334 + "A": 0.9011442065238953, + "B": 0.0022337131667882204, + "C": 0.011343731544911861, + "D": 0.00026677901041693985 }, "sample": { "messages": [ @@ -39562,10 +39562,10 @@ ] }, "predict": { - "A": 0.8884726762771606, - "B": 0.06436074525117874, - "C": 0.0028278138488531113, - "D": 0.01118422020226717 + "A": 0.9011300802230835, + "B": 0.00883436854928732, + "C": 0.0011956017697229981, + "D": 0.0023777380120009184 }, "sample": { "messages": [ @@ -39607,10 +39607,10 @@ ] }, "predict": { - "A": 0.0031393703538924456, - "B": 0.0009574537398293614, - "C": 0.9863607883453369, - "D": 0.0003108397650066763 + "A": 0.0007476421887986362, + "B": 0.0001220492267748341, + "C": 0.9889751672744751, + "D": 0.00015671430446673185 }, "sample": { "messages": [ @@ -39652,10 +39652,10 @@ ] }, "predict": { - "A": 0.944063663482666, - "B": 0.007208014838397503, - "C": 0.010487601161003113, - "D": 0.0034048252273350954 + "A": 0.8798776268959045, + "B": 0.009774558246135712, + "C": 0.018261276185512543, + "D": 0.006717948243021965 }, "sample": { "messages": [ @@ -39697,10 +39697,10 @@ ] }, "predict": { - "A": 0.11467596888542175, - "B": 0.012086757458746433, - "C": 0.008307099342346191, - "D": 0.8473471403121948 + "A": 0.06603355705738068, + "B": 0.0016531177097931504, + "C": 0.0014588714111596346, + "D": 0.9115651249885559 }, "sample": { "messages": [ @@ -39742,10 +39742,10 @@ ] }, "predict": { - "A": 0.0027433994691818953, - "B": 0.9767177700996399, - "C": 0.00015477198758162558, - "D": 0.00015477198758162558 + "A": 0.0017082643462345004, + "B": 0.884902834892273, + "C": 0.00014926647418178618, + "D": 0.000123746256576851 }, "sample": { "messages": [ @@ -39787,10 +39787,10 @@ ] }, "predict": { - "A": 0.7150726914405823, - "B": 0.1408061534166336, - "C": 0.031418103724718094, - "D": 0.08540325611829758 + "A": 0.7342414259910583, + "B": 0.03655572980642319, + "C": 0.01047339104115963, + "D": 0.1638314127922058 }, "sample": { "messages": [ @@ -39832,10 +39832,10 @@ ] }, "predict": { - "A": 0.009396139532327652, - "B": 0.009396139532327652, - "C": 0.0026920391246676445, - "D": 0.9584321975708008 + "A": 0.004376685246825218, + "B": 0.007215933408588171, + "C": 0.00046129923430271447, + "D": 0.9451007843017578 }, "sample": { "messages": [ @@ -39877,10 +39877,10 @@ ] }, "predict": { - "A": 0.001316634239628911, - "B": 0.0007985790143720806, - "C": 0.9923527836799622, - "D": 0.00022879672178532928 + "A": 0.0007976058404892683, + "B": 0.00013860312174074352, + "C": 0.9911434650421143, + "D": 3.5044355172431096e-05 }, "sample": { "messages": [ @@ -39922,10 +39922,10 @@ ] }, "predict": { - "A": 0.1091189906001091, - "B": 0.2617628574371338, - "C": 0.015720082446932793, - "D": 0.5541519522666931 + "A": 0.2346818894147873, + "B": 0.2659294307231903, + "C": 0.05236461013555527, + "D": 0.34146013855934143 }, "sample": { "messages": [ @@ -39967,10 +39967,10 @@ ] }, "predict": { - "A": 0.8130722045898438, - "B": 0.011597851291298866, - "C": 0.12468866258859634, - "D": 0.0013851659605279565 + "A": 0.8996325731277466, + "B": 0.0013525428948923945, + "C": 0.014541206881403923, + "D": 0.0001948522694874555 }, "sample": { "messages": [ @@ -40012,10 +40012,10 @@ ] }, "predict": { - "A": 0.42241159081459045, - "B": 0.0017262997571378946, - "C": 0.0010470537235960364, - "D": 0.5423872470855713 + "A": 0.10847227275371552, + "B": 0.00047189186443574727, + "C": 0.0009384690201841295, + "D": 0.8015077114105225 }, "sample": { "messages": [ @@ -40057,10 +40057,10 @@ ] }, "predict": { - "A": 0.9735696315765381, - "B": 0.003978756722062826, - "C": 0.00037008203798905015, - "D": 0.0004193578497506678 + "A": 0.9318852424621582, + "B": 0.00020183788728900254, + "C": 7.904085941845551e-05, + "D": 6.155708251753822e-05 }, "sample": { "messages": [ @@ -40102,10 +40102,10 @@ ] }, "predict": { - "A": 0.03622055426239967, - "B": 0.9341403841972351, - "C": 0.0023155026137828827, - "D": 0.0133247971534729 + "A": 0.005015233997255564, + "B": 0.9557344317436218, + "C": 0.0012680497020483017, + "D": 0.005683004390448332 }, "sample": { "messages": [ @@ -40147,10 +40147,10 @@ ] }, "predict": { - "A": 0.003949740435928106, - "B": 0.020058436319231987, - "C": 0.9664695858955383, - "D": 0.0009986506775021553 + "A": 0.0035159364342689514, + "B": 0.008434292860329151, + "C": 0.9748716950416565, + "D": 0.0005739608313888311 }, "sample": { "messages": [ @@ -40192,10 +40192,10 @@ ] }, "predict": { - "A": 0.006605919450521469, - "B": 0.980405330657959, - "C": 0.0011479365639388561, - "D": 0.0010130505543202162 + "A": 0.00635090796276927, + "B": 0.9425583481788635, + "C": 0.0010367572540417314, + "D": 0.0006693807663396001 }, "sample": { "messages": [ @@ -40237,10 +40237,10 @@ ] }, "predict": { - "A": 0.05899006873369217, - "B": 0.922760009765625, - "C": 0.001387313473969698, - "D": 0.001224299892783165 + "A": 0.08058200776576996, + "B": 0.8663382530212402, + "C": 0.001475911121815443, + "D": 0.0008951852796599269 }, "sample": { "messages": [ @@ -40282,10 +40282,10 @@ ] }, "predict": { - "A": 0.004017700906842947, - "B": 0.0021505204495042562, - "C": 0.9830990433692932, - "D": 0.0002734079898800701 + "A": 0.0031370026990771294, + "B": 0.00033063761657103896, + "C": 0.9856168627738953, + "D": 6.930541712790728e-05 }, "sample": { "messages": [ @@ -40320,17 +40320,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "D" ] }, "predict": { - "A": 0.017139863222837448, - "B": 0.7288054823875427, - "C": 0.019422007724642754, - "D": 0.20880627632141113 + "A": 0.025100627914071083, + "B": 0.2698571979999542, + "C": 0.036521200090646744, + "D": 0.6473537683486938 }, "sample": { "messages": [ @@ -40360,7 +40360,7 @@ "prompt_len": 63, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -40372,10 +40372,10 @@ ] }, "predict": { - "A": 0.00037819327553734183, - "B": 0.0002024322748184204, - "C": 0.994907796382904, - "D": 0.000178645845153369 + "A": 0.00010755627590697259, + "B": 4.483612428884953e-05, + "C": 0.9875813722610474, + "D": 3.081539034610614e-05 }, "sample": { "messages": [ @@ -40417,10 +40417,10 @@ ] }, "predict": { - "A": 0.8704720139503479, - "B": 0.014069869183003902, - "C": 0.0058651939034461975, - "D": 0.06305678188800812 + "A": 0.8258052468299866, + "B": 0.0023195170797407627, + "C": 0.0038242372684180737, + "D": 0.004333428107202053 }, "sample": { "messages": [ @@ -40462,10 +40462,10 @@ ] }, "predict": { - "A": 0.04987754300236702, - "B": 0.044016774743795395, - "C": 0.0024832566268742085, - "D": 0.8841005563735962 + "A": 0.01062536146491766, + "B": 0.0050190649926662445, + "C": 0.0009284336701966822, + "D": 0.9564645290374756 }, "sample": { "messages": [ @@ -40507,10 +40507,10 @@ ] }, "predict": { - "A": 0.0021453220397233963, - "B": 0.9807226061820984, - "C": 0.000789219862781465, - "D": 0.0004224389558658004 + "A": 0.004177432507276535, + "B": 0.9020742177963257, + "C": 0.0007727472693659365, + "D": 0.00036501995055004954 }, "sample": { "messages": [ @@ -40552,10 +40552,10 @@ ] }, "predict": { - "A": 0.0318576917052269, - "B": 0.30225735902786255, - "C": 0.002615038538351655, - "D": 0.6398788094520569 + "A": 0.047612838447093964, + "B": 0.1661851555109024, + "C": 0.0018461502622812986, + "D": 0.7447901368141174 }, "sample": { "messages": [ @@ -40597,10 +40597,10 @@ ] }, "predict": { - "A": 0.007345546502619982, - "B": 0.006482422351837158, - "C": 0.007345546502619982, - "D": 0.9620767831802368 + "A": 0.0030411300249397755, + "B": 0.0008712983108125627, + "C": 0.0022249380126595497, + "D": 0.9554945826530457 }, "sample": { "messages": [ @@ -40642,10 +40642,10 @@ ] }, "predict": { - "A": 0.06210128217935562, - "B": 0.8572817444801331, - "C": 0.020161334425210953, - "D": 0.006545426324009895 + "A": 0.202708438038826, + "B": 0.48627227544784546, + "C": 0.12294888496398926, + "D": 0.13931934535503387 }, "sample": { "messages": [ @@ -40687,10 +40687,10 @@ ] }, "predict": { - "A": 0.05817730724811554, - "B": 0.014709527604281902, - "C": 0.9100462794303894, - "D": 0.0015503729227930307 + "A": 0.028695711866021156, + "B": 0.0006339708925224841, + "C": 0.950271487236023, + "D": 0.0006748584564775229 }, "sample": { "messages": [ @@ -40732,10 +40732,10 @@ ] }, "predict": { - "A": 0.006333444733172655, - "B": 0.011832431890070438, - "C": 0.9399664998054504, - "D": 0.0055892448872327805 + "A": 0.0005130782956257463, + "B": 0.00035263318568468094, + "C": 0.9874967336654663, + "D": 0.0006188905099406838 }, "sample": { "messages": [ @@ -40777,10 +40777,10 @@ ] }, "predict": { - "A": 0.0014619971625506878, - "B": 0.9724351763725281, - "C": 0.0001746106572682038, - "D": 0.0016566599952057004 + "A": 0.007964059710502625, + "B": 0.9205201268196106, + "C": 0.0004782789619639516, + "D": 0.007964059710502625 }, "sample": { "messages": [ @@ -40822,10 +40822,10 @@ ] }, "predict": { - "A": 0.01680009998381138, - "B": 0.9172543287277222, - "C": 0.0037486087530851364, - "D": 0.0355658084154129 + "A": 0.09334621578454971, + "B": 0.7815783023834229, + "C": 0.007198086939752102, + "D": 0.07269810140132904 }, "sample": { "messages": [ @@ -40867,10 +40867,10 @@ ] }, "predict": { - "A": 0.45878201723098755, - "B": 0.3572998046875, - "C": 0.1489447057247162, - "D": 0.003969242330640554 + "A": 0.32874518632888794, + "B": 0.32874518632888794, + "C": 0.2560270130634308, + "D": 0.007731345947831869 }, "sample": { "messages": [ @@ -40912,10 +40912,10 @@ ] }, "predict": { - "A": 0.3569963276386261, - "B": 0.5194265842437744, - "C": 0.002405422506853938, - "D": 0.09026280045509338 + "A": 0.41399839520454407, + "B": 0.5315844416618347, + "C": 0.0010923835216090083, + "D": 0.011032682843506336 }, "sample": { "messages": [ @@ -40957,10 +40957,10 @@ ] }, "predict": { - "A": 0.0026964587159454823, - "B": 0.9600057601928711, - "C": 0.009411565959453583, - "D": 0.003923323936760426 + "A": 0.002281319350004196, + "B": 0.9203498363494873, + "C": 0.010224164463579655, + "D": 0.002929271897301078 }, "sample": { "messages": [ @@ -41002,10 +41002,10 @@ ] }, "predict": { - "A": 0.16004180908203125, - "B": 0.8127594590187073, - "C": 0.0033215621951967478, - "D": 0.009028942324221134 + "A": 0.0905296579003334, + "B": 0.8589215278625488, + "C": 0.00309775467030704, + "D": 0.009541756473481655 }, "sample": { "messages": [ @@ -41047,10 +41047,10 @@ ] }, "predict": { - "A": 0.00116126355715096, - "B": 0.0009043930331245065, - "C": 0.9917873740196228, - "D": 0.00017808553820941597 + "A": 0.00048429216258227825, + "B": 0.00016736688849050552, + "C": 0.9922083020210266, + "D": 5.1044018618995324e-05 }, "sample": { "messages": [ @@ -41092,10 +41092,10 @@ ] }, "predict": { - "A": 0.04421863704919815, - "B": 0.03443751111626625, - "C": 0.8881551027297974, - "D": 0.01118022296577692 + "A": 0.021779809147119522, + "B": 0.011657890863716602, + "C": 0.9261009693145752, + "D": 0.02467975579202175 }, "sample": { "messages": [ @@ -41137,10 +41137,10 @@ ] }, "predict": { - "A": 0.008543548174202442, - "B": 0.194450244307518, - "C": 0.0014846459962427616, - "D": 0.7690656185150146 + "A": 0.012874477542936802, + "B": 0.138413667678833, + "C": 0.0014444763073697686, + "D": 0.796515703201294 }, "sample": { "messages": [ @@ -41182,10 +41182,10 @@ ] }, "predict": { - "A": 0.0005346778780221939, - "B": 0.0005022833938710392, - "C": 0.003950764890760183, - "D": 0.9667202830314636 + "A": 0.00025893989368341863, + "B": 5.427674841484986e-05, + "C": 0.0002756400790531188, + "D": 0.9310757517814636 }, "sample": { "messages": [ @@ -41227,10 +41227,10 @@ ] }, "predict": { - "A": 0.6674776673316956, - "B": 0.03323175385594368, - "C": 0.029326921328902245, - "D": 0.24555133283138275 + "A": 0.8212800025939941, + "B": 0.0038032811135053635, + "C": 0.011714929714798927, + "D": 0.12594737112522125 }, "sample": { "messages": [ @@ -41272,10 +41272,10 @@ ] }, "predict": { - "A": 0.03688136115670204, - "B": 0.011973625048995018, - "C": 0.1872989982366562, - "D": 0.7407819628715515 + "A": 0.019000651314854622, + "B": 0.0020026538986712694, + "C": 0.040224380791187286, + "D": 0.915502667427063 }, "sample": { "messages": [ @@ -41317,10 +41317,10 @@ ] }, "predict": { - "A": 0.00020220226724632084, - "B": 0.00020220226724632084, - "C": 0.9937773942947388, - "D": 0.00017844286048784852 + "A": 7.371279207291082e-05, + "B": 5.39294233021792e-05, + "C": 0.9847826361656189, + "D": 4.470906424103305e-05 }, "sample": { "messages": [ @@ -41355,17 +41355,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.31813621520996094, - "B": 0.36049556732177734, - "C": 0.2807542383670807, - "D": 0.002752417465671897 + "A": 0.6450801491737366, + "B": 0.11209811270236969, + "C": 0.14393684267997742, + "D": 0.0026362950447946787 }, "sample": { "messages": [ @@ -41395,7 +41395,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -41407,10 +41407,10 @@ ] }, "predict": { - "A": 0.001265207538381219, - "B": 0.0005976416869089007, - "C": 0.00030051314388401806, - "D": 0.9535922408103943 + "A": 0.0003559531760402024, + "B": 7.942389493109658e-05, + "C": 6.185538950376213e-05, + "D": 0.8796674013137817 }, "sample": { "messages": [ @@ -41452,10 +41452,10 @@ ] }, "predict": { - "A": 0.8510561585426331, - "B": 0.04801330715417862, - "C": 0.06165030598640442, - "D": 0.0016429249662905931 + "A": 0.8129352927207947, + "B": 0.02166399173438549, + "C": 0.035717885941267014, + "D": 0.0014742531348019838 }, "sample": { "messages": [ @@ -41497,10 +41497,10 @@ ] }, "predict": { - "A": 0.004003872629255056, - "B": 0.005825600121170282, - "C": 0.9797153472900391, - "D": 0.002428471576422453 + "A": 0.0013120868243277073, + "B": 0.0010218542302027345, + "C": 0.9889253973960876, + "D": 0.0009017832344397902 }, "sample": { "messages": [ @@ -41542,10 +41542,10 @@ ] }, "predict": { - "A": 0.027996566146612167, - "B": 0.9271188974380493, - "C": 0.004865073598921299, - "D": 0.014985481277108192 + "A": 0.019179031252861023, + "B": 0.9240975379943848, + "C": 0.004849218763411045, + "D": 0.007055571302771568 }, "sample": { "messages": [ @@ -41587,10 +41587,10 @@ ] }, "predict": { - "A": 0.00037549069384112954, - "B": 0.0003527408407535404, - "C": 0.00022774659737478942, - "D": 0.9877981543540955 + "A": 0.0003074755659326911, + "B": 0.00013644144928548485, + "C": 0.00022495391021948308, + "D": 0.9756854772567749 }, "sample": { "messages": [ @@ -41632,10 +41632,10 @@ ] }, "predict": { - "A": 0.011748179793357849, - "B": 0.05966217815876007, - "C": 0.7268341183662415, - "D": 0.18377244472503662 + "A": 0.004968674387782812, + "B": 0.019651487469673157, + "C": 0.946861743927002, + "D": 0.022268055006861687 }, "sample": { "messages": [ @@ -41677,10 +41677,10 @@ ] }, "predict": { - "A": 0.14205536246299744, - "B": 0.38614651560783386, - "C": 0.004289700649678707, - "D": 0.4375613331794739 + "A": 0.12452346086502075, + "B": 0.3835592567920685, + "C": 0.00547117879614234, + "D": 0.43462955951690674 }, "sample": { "messages": [ @@ -41722,10 +41722,10 @@ ] }, "predict": { - "A": 0.777072012424469, - "B": 0.1964745670557022, - "C": 0.003175715683028102, - "D": 0.011084336787462234 + "A": 0.962291419506073, + "B": 0.012113461270928383, + "C": 0.0005322283250279725, + "D": 0.001857659430243075 }, "sample": { "messages": [ @@ -41767,10 +41767,10 @@ ] }, "predict": { - "A": 0.009438795037567616, - "B": 0.017633991315960884, - "C": 0.9627832174301147, - "D": 0.001127303228713572 + "A": 0.004543025977909565, + "B": 0.003538111923262477, + "C": 0.9810203909873962, + "D": 0.00010684172593755648 }, "sample": { "messages": [ @@ -41812,10 +41812,10 @@ ] }, "predict": { - "A": 0.0021382984705269337, - "B": 0.009583190083503723, - "C": 0.9775118827819824, - "D": 0.0001988927397178486 + "A": 0.0009566330700181425, + "B": 0.0010840073227882385, + "C": 0.9855153560638428, + "D": 7.376763096544892e-05 }, "sample": { "messages": [ @@ -41850,17 +41850,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.15105748176574707, - "B": 0.5272424221038818, - "C": 0.049041181802749634, - "D": 0.24905169010162354 + "A": 0.19606457650661469, + "B": 0.13475309312343597, + "C": 0.02341657504439354, + "D": 0.6039214134216309 }, "sample": { "messages": [ @@ -41890,7 +41890,7 @@ "prompt_len": 106, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -41902,10 +41902,10 @@ ] }, "predict": { - "A": 0.16597169637680054, - "B": 0.04755168408155441, - "C": 0.022461825981736183, - "D": 0.7438335418701172 + "A": 0.06859152019023895, + "B": 0.05341913178563118, + "C": 0.07772437483072281, + "D": 0.7374283671379089 }, "sample": { "messages": [ @@ -41947,10 +41947,10 @@ ] }, "predict": { - "A": 0.05685751512646675, - "B": 0.020916711539030075, - "C": 0.012686626054346561, - "D": 0.8894011974334717 + "A": 0.013303583487868309, + "B": 0.0015888867201283574, + "C": 0.013303583487868309, + "D": 0.9326532483100891 }, "sample": { "messages": [ @@ -41992,10 +41992,10 @@ ] }, "predict": { - "A": 0.05758166313171387, - "B": 0.07393631339073181, - "C": 0.7948903441429138, - "D": 0.039575256407260895 + "A": 0.07145768404006958, + "B": 0.09175348281860352, + "C": 0.7682424783706665, + "D": 0.010958390310406685 }, "sample": { "messages": [ @@ -42037,10 +42037,10 @@ ] }, "predict": { - "A": 0.9248842000961304, - "B": 0.013192763552069664, - "C": 0.004853346850723028, - "D": 0.0008433852344751358 + "A": 0.8878573179244995, + "B": 0.001334839384071529, + "C": 0.004111595451831818, + "D": 0.00015942382742650807 }, "sample": { "messages": [ @@ -42082,10 +42082,10 @@ ] }, "predict": { - "A": 0.0014701562467962503, - "B": 0.001010422594845295, - "C": 0.0006128522800281644, - "D": 0.9778621196746826 + "A": 0.0005993933300487697, + "B": 0.00013374275295063853, + "C": 0.00024986432981677353, + "D": 0.9563872218132019 }, "sample": { "messages": [ @@ -42127,10 +42127,10 @@ ] }, "predict": { - "A": 0.019174862653017044, - "B": 0.006225166376680136, - "C": 0.001081771682947874, - "D": 0.9238966107368469 + "A": 0.00691206706687808, + "B": 0.0008787707192823291, + "C": 0.0001434553851140663, + "D": 0.9053021669387817 }, "sample": { "messages": [ @@ -42168,14 +42168,14 @@ "acc": false, "f1_macro": [ "C", - "B" + "A" ] }, "predict": { - "A": 0.15281815826892853, - "B": 0.7760747075080872, - "C": 0.008621412329375744, - "D": 0.018251528963446617 + "A": 0.8294015526771545, + "B": 0.028380559757351875, + "C": 0.0063325585797429085, + "D": 0.004931801930069923 }, "sample": { "messages": [ @@ -42205,7 +42205,7 @@ "prompt_len": 59, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -42217,10 +42217,10 @@ ] }, "predict": { - "A": 0.0007960146758705378, - "B": 0.0011581945000216365, - "C": 0.9891662001609802, - "D": 0.00017761487106326967 + "A": 0.0004257464606780559, + "B": 0.0002748825936578214, + "C": 0.9884012937545776, + "D": 6.529033271363005e-05 }, "sample": { "messages": [ @@ -42255,17 +42255,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.16175740957260132, - "B": 0.8214719891548157, - "C": 0.000400956574594602, - "D": 0.00027557314024306834 + "A": 0.5378518104553223, + "B": 0.4188793897628784, + "C": 0.00027945416513830423, + "D": 0.0002044530410785228 }, "sample": { "messages": [ @@ -42295,7 +42295,7 @@ "prompt_len": 62, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -42307,10 +42307,10 @@ ] }, "predict": { - "A": 0.0009028420317918062, - "B": 0.0024541791062802076, - "C": 0.9900864958763123, - "D": 0.00016700898413546383 + "A": 0.00029410567367449403, + "B": 0.00031307386234402657, + "C": 0.9934501051902771, + "D": 2.5698667741380632e-05 }, "sample": { "messages": [ @@ -42352,10 +42352,10 @@ ] }, "predict": { - "A": 0.9439921975135803, - "B": 0.01346532441675663, - "C": 0.0030045202001929283, - "D": 0.007207468617707491 + "A": 0.9524819850921631, + "B": 0.0016226673033088446, + "C": 0.0005969458725303411, + "D": 0.0072722891345620155 }, "sample": { "messages": [ @@ -42397,10 +42397,10 @@ ] }, "predict": { - "A": 0.0048427097499370575, - "B": 0.8144185543060303, - "C": 0.002287534298375249, - "D": 0.16036851704120636 + "A": 0.009271972812712193, + "B": 0.834636390209198, + "C": 0.0016112272860482335, + "D": 0.11295574903488159 }, "sample": { "messages": [ @@ -42442,10 +42442,10 @@ ] }, "predict": { - "A": 0.0031257241498678923, - "B": 0.0007903068326413631, - "C": 8.866997086443007e-05, - "D": 0.9820733070373535 + "A": 0.001619044691324234, + "B": 0.00017064603161998093, + "C": 2.4583894628449343e-05, + "D": 0.950355589389801 }, "sample": { "messages": [ @@ -42487,10 +42487,10 @@ ] }, "predict": { - "A": 0.013305106200277805, - "B": 0.9327600002288818, - "C": 0.011741715483367443, - "D": 0.010362028144299984 + "A": 0.010975910350680351, + "B": 0.8719245195388794, + "C": 0.033808182924985886, + "D": 0.002954126102849841 }, "sample": { "messages": [ @@ -42532,10 +42532,10 @@ ] }, "predict": { - "A": 0.0008873082115314901, - "B": 0.005106106400489807, - "C": 0.0002542180591262877, - "D": 0.9730516076087952 + "A": 0.00034100128686986864, + "B": 0.0017317475285381079, + "C": 8.099490514723584e-05, + "D": 0.9549233317375183 }, "sample": { "messages": [ @@ -42577,10 +42577,10 @@ ] }, "predict": { - "A": 0.00013921713980380446, - "B": 0.00013921713980380446, - "C": 0.9955343008041382, - "D": 0.0006239279173314571 + "A": 2.4174369173124433e-05, + "B": 3.3042440918507054e-05, + "C": 0.9947959780693054, + "D": 0.0001391138939652592 }, "sample": { "messages": [ @@ -42622,10 +42622,10 @@ ] }, "predict": { - "A": 0.10327335447072983, - "B": 0.11702404916286469, - "C": 0.7630926370620728, - "D": 0.001147263334132731 + "A": 0.05811304599046707, + "B": 0.01469327975064516, + "C": 0.9090410470962524, + "D": 0.0002528118493501097 }, "sample": { "messages": [ @@ -42667,10 +42667,10 @@ ] }, "predict": { - "A": 0.18860413134098053, - "B": 0.1296255886554718, - "C": 0.00502612954005599, - "D": 0.6582931280136108 + "A": 0.08976572751998901, + "B": 0.02269633114337921, + "C": 0.004757409915328026, + "D": 0.8516735434532166 }, "sample": { "messages": [ @@ -42712,10 +42712,10 @@ ] }, "predict": { - "A": 0.0027405291330069304, - "B": 0.9756959080696106, - "C": 0.00146689941175282, - "D": 0.00146689941175282 + "A": 0.0010304575553163886, + "B": 0.9368310570716858, + "C": 0.0014084684662520885, + "D": 0.0009093755506910384 }, "sample": { "messages": [ @@ -42757,10 +42757,10 @@ ] }, "predict": { - "A": 0.0365171916782856, - "B": 0.004942064639180899, - "C": 0.0001916246401378885, - "D": 0.9417908191680908 + "A": 0.005040560383349657, + "B": 0.0006821656716056168, + "C": 5.599556607194245e-05, + "D": 0.9605607986450195 }, "sample": { "messages": [ @@ -42802,10 +42802,10 @@ ] }, "predict": { - "A": 0.6593335270881653, - "B": 0.24255527555942535, - "C": 0.028969094157218933, - "D": 0.009404887445271015 + "A": 0.7048690319061279, + "B": 0.014629089273512363, + "C": 0.1782187968492508, + "D": 0.002243443625047803 }, "sample": { "messages": [ @@ -42847,10 +42847,10 @@ ] }, "predict": { - "A": 0.013798890635371208, - "B": 0.006518134381622076, - "C": 0.9673769474029541, - "D": 0.0004166903963778168 + "A": 0.002124237362295389, + "B": 0.0007814632845111191, + "C": 0.9710838794708252, + "D": 9.935189882526174e-05 }, "sample": { "messages": [ @@ -42892,10 +42892,10 @@ ] }, "predict": { - "A": 0.6481305360794067, - "B": 0.05320179462432861, - "C": 0.05320179462432861, - "D": 0.21041718125343323 + "A": 0.5455950498580933, + "B": 0.006061013787984848, + "C": 0.004165669437497854, + "D": 0.4249098598957062 }, "sample": { "messages": [ @@ -42933,14 +42933,14 @@ "acc": false, "f1_macro": [ "B", - "C" + "A" ] }, "predict": { - "A": 0.39843112230300903, - "B": 0.11415243148803711, - "C": 0.4514816105365753, - "D": 0.005683314986526966 + "A": 0.5296376943588257, + "B": 0.07167866080999374, + "C": 0.2501831352710724, + "D": 0.003352455561980605 }, "sample": { "messages": [ @@ -42970,22 +42970,22 @@ "prompt_len": 108, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.19918830692768097, - "B": 0.07327727228403091, - "C": 0.2257099151611328, - "D": 0.42168164253234863 + "A": 0.4640425145626068, + "B": 0.033615127205848694, + "C": 0.193441703915596, + "D": 0.21919816732406616 }, "sample": { "messages": [ @@ -43015,7 +43015,7 @@ "prompt_len": 60, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -43027,10 +43027,10 @@ ] }, "predict": { - "A": 0.03660201281309128, - "B": 0.008167012594640255, - "C": 0.9439783692359924, - "D": 0.0005916163790971041 + "A": 0.022593451663851738, + "B": 0.004448914434760809, + "C": 0.9606979489326477, + "D": 0.00041381348273716867 }, "sample": { "messages": [ @@ -43072,10 +43072,10 @@ ] }, "predict": { - "A": 0.001687139505520463, - "B": 0.0019117793999612331, - "C": 0.9903262257575989, - "D": 0.00013009822578169405 + "A": 0.0008493610075674951, + "B": 0.00020174092787783593, + "C": 0.9915100336074829, + "D": 3.731831748154946e-05 }, "sample": { "messages": [ @@ -43117,10 +43117,10 @@ ] }, "predict": { - "A": 0.005179411731660366, - "B": 0.0016815089620649815, - "C": 0.9870211482048035, - "D": 0.0009000467834994197 + "A": 0.0015836403472349048, + "B": 0.00031183729879558086, + "C": 0.9895261526107788, + "D": 0.00020133727230131626 }, "sample": { "messages": [ @@ -43162,10 +43162,10 @@ ] }, "predict": { - "A": 0.0003743151610251516, - "B": 0.0002572627563495189, - "C": 5.392520688474178e-05, - "D": 0.9847056269645691 + "A": 0.00029362697387114167, + "B": 0.00012240193609613925, + "C": 3.2944029953796417e-05, + "D": 0.931740939617157 }, "sample": { "messages": [ @@ -43207,10 +43207,10 @@ ] }, "predict": { - "A": 0.031946536153554916, - "B": 0.0048991600051522255, - "C": 0.11150436848402023, - "D": 0.8239120841026306 + "A": 0.18087944388389587, + "B": 0.001564913778565824, + "C": 0.035617273300886154, + "D": 0.7153920531272888 }, "sample": { "messages": [ @@ -43252,10 +43252,10 @@ ] }, "predict": { - "A": 0.005033708177506924, - "B": 0.0034596137702465057, - "C": 0.006463408935815096, - "D": 0.9592549800872803 + "A": 0.002447202568873763, + "B": 0.0007463545771315694, + "C": 0.0006586555973626673, + "D": 0.9274561405181885 }, "sample": { "messages": [ @@ -43297,10 +43297,10 @@ ] }, "predict": { - "A": 0.04337058216333389, - "B": 0.049145303666591644, - "C": 0.5283618569374084, - "D": 0.363137423992157 + "A": 0.00632859580218792, + "B": 0.004928715527057648, + "C": 0.7314862012863159, + "D": 0.2374788075685501 }, "sample": { "messages": [ @@ -43342,10 +43342,10 @@ ] }, "predict": { - "A": 0.013362176716327667, - "B": 0.024963833391666412, - "C": 0.008104570209980011, - "D": 0.9367609620094299 + "A": 0.011813649907708168, + "B": 0.00716534024104476, + "C": 0.004345999099314213, + "D": 0.9384744167327881 }, "sample": { "messages": [ @@ -43387,10 +43387,10 @@ ] }, "predict": { - "A": 0.024173671379685402, - "B": 0.0351724848151207, - "C": 0.003271550638601184, - "D": 0.907110333442688 + "A": 0.017269661650061607, + "B": 0.003619917668402195, + "C": 0.0016063288785517216, + "D": 0.942891538143158 }, "sample": { "messages": [ @@ -43432,10 +43432,10 @@ ] }, "predict": { - "A": 0.16676095128059387, - "B": 0.06951630860567093, - "C": 0.747370719909668, - "D": 0.004444028716534376 + "A": 0.020139796659350395, + "B": 0.004493796266615391, + "C": 0.9703897833824158, + "D": 0.00057132204528898 }, "sample": { "messages": [ @@ -43477,10 +43477,10 @@ ] }, "predict": { - "A": 0.7485445141792297, - "B": 0.061444275081157684, - "C": 0.1300775110721588, - "D": 0.013710071332752705 + "A": 0.8148160576820374, + "B": 0.013170274905860424, + "C": 0.0405673012137413, + "D": 0.003329967148602009 }, "sample": { "messages": [ @@ -43522,10 +43522,10 @@ ] }, "predict": { - "A": 0.08684740960597992, - "B": 0.7271644473075867, - "C": 0.12636221945285797, - "D": 0.007128869649022818 + "A": 0.11468815058469772, + "B": 0.6599847674369812, + "C": 0.11468815058469772, + "D": 0.004177515860646963 }, "sample": { "messages": [ @@ -43567,10 +43567,10 @@ ] }, "predict": { - "A": 0.000623747066128999, - "B": 0.000428694678703323, - "C": 0.9952457547187805, - "D": 6.57424534438178e-05 + "A": 0.0002767136029433459, + "B": 4.517225534073077e-05, + "C": 0.9949851036071777, + "D": 1.0079291314468719e-05 }, "sample": { "messages": [ @@ -43612,10 +43612,10 @@ ] }, "predict": { - "A": 0.8625843524932861, - "B": 0.01230410486459732, - "C": 0.029516013339161873, - "D": 0.07080523669719696 + "A": 0.865818202495575, + "B": 0.0010791514068841934, + "C": 0.005833837203681469, + "D": 0.08053363859653473 }, "sample": { "messages": [ @@ -43657,10 +43657,10 @@ ] }, "predict": { - "A": 0.025662416592240334, - "B": 0.0018589792307466269, - "C": 0.0006035221740603447, - "D": 0.9629750847816467 + "A": 0.010751275345683098, + "B": 0.00036788839497603476, + "C": 0.00015335869102273136, + "D": 0.9677988886833191 }, "sample": { "messages": [ @@ -43702,10 +43702,10 @@ ] }, "predict": { - "A": 0.0008476068032905459, - "B": 0.07629913091659546, - "C": 0.0864582434296608, - "D": 0.8202929496765137 + "A": 0.0003453651734162122, + "B": 0.004207409452646971, + "C": 0.027435753494501114, + "D": 0.9085474014282227 }, "sample": { "messages": [ @@ -43747,10 +43747,10 @@ ] }, "predict": { - "A": 0.02823955938220024, - "B": 0.009168042801320553, - "C": 0.010388754308223724, - "D": 0.9351657629013062 + "A": 0.007272658403962851, + "B": 0.001345306634902954, + "C": 0.0007665326120331883, + "D": 0.9525303244590759 }, "sample": { "messages": [ @@ -43792,10 +43792,10 @@ ] }, "predict": { - "A": 0.01030799001455307, - "B": 0.0023000233341008425, - "C": 0.0010864540236070752, - "D": 0.9278956055641174 + "A": 0.014859357848763466, + "B": 0.0009499269654043019, + "C": 0.0005761598004028201, + "D": 0.9193159341812134 }, "sample": { "messages": [ @@ -43837,10 +43837,10 @@ ] }, "predict": { - "A": 0.06511642783880234, - "B": 0.0186561681330204, - "C": 0.89890456199646, - "D": 0.0008725606021471322 + "A": 0.07263068854808807, + "B": 0.008674499578773975, + "C": 0.8848229050636292, + "D": 0.0004318778810556978 }, "sample": { "messages": [ @@ -43882,10 +43882,10 @@ ] }, "predict": { - "A": 0.0024237283505499363, - "B": 0.0008916397928260267, - "C": 0.00037169078132137656, - "D": 0.9778017401695251 + "A": 0.001168315066024661, + "B": 0.00020302268967498094, + "C": 0.00020302268967498094, + "D": 0.9373553991317749 }, "sample": { "messages": [ @@ -43927,10 +43927,10 @@ ] }, "predict": { - "A": 0.9529586434364319, - "B": 0.011995978653430939, - "C": 0.0009846899192780256, - "D": 0.0003196820034645498 + "A": 0.9124509692192078, + "B": 0.0019959784112870693, + "C": 0.00021037456463091075, + "D": 4.9968341045314446e-05 }, "sample": { "messages": [ @@ -43972,10 +43972,10 @@ ] }, "predict": { - "A": 0.19029264152050018, - "B": 0.7526220083236694, - "C": 0.02272721566259861, - "D": 0.0073784468695521355 + "A": 0.2792985141277313, + "B": 0.5912749171257019, + "C": 0.04283181577920914, + "D": 0.01390545442700386 }, "sample": { "messages": [ @@ -44017,10 +44017,10 @@ ] }, "predict": { - "A": 0.02419106476008892, - "B": 0.03988432511687279, - "C": 0.9077630043029785, - "D": 0.016626259312033653 + "A": 0.01556149311363697, + "B": 0.007350728381425142, + "C": 0.9627555012702942, + "D": 0.005724753253161907 }, "sample": { "messages": [ @@ -44062,10 +44062,10 @@ ] }, "predict": { - "A": 0.17268106341362, - "B": 0.026481498032808304, - "C": 0.007127398159354925, - "D": 0.7739028334617615 + "A": 0.3064759373664856, + "B": 0.00925477035343647, + "C": 0.002651536138728261, + "D": 0.6488094925880432 }, "sample": { "messages": [ @@ -44107,10 +44107,10 @@ ] }, "predict": { - "A": 0.22622931003570557, - "B": 0.034693386405706406, - "C": 0.614954948425293, - "D": 0.05719972029328346 + "A": 0.16426339745521545, + "B": 0.017313234508037567, + "C": 0.7361774444580078, + "D": 0.004960324615240097 }, "sample": { "messages": [ @@ -44152,10 +44152,10 @@ ] }, "predict": { - "A": 0.0012671954464167356, - "B": 0.9550905227661133, - "C": 0.0003630575956776738, - "D": 0.0008709298563189805 + "A": 0.0019270657794550061, + "B": 0.8275740146636963, + "C": 0.00040393497329205275, + "D": 0.0011688244994729757 }, "sample": { "messages": [ @@ -44197,10 +44197,10 @@ ] }, "predict": { - "A": 0.0011460100067779422, - "B": 0.0033160902094095945, - "C": 0.0001285787730012089, - "D": 0.9787599444389343 + "A": 0.0003906664496753365, + "B": 0.0003238738572690636, + "C": 6.377453973982483e-05, + "D": 0.9654543399810791 }, "sample": { "messages": [ @@ -44242,10 +44242,10 @@ ] }, "predict": { - "A": 0.005871692206710577, - "B": 0.0016822682227939367, - "C": 0.9874668717384338, - "D": 0.0007946472032926977 + "A": 0.0019125064136460423, + "B": 0.0002588295901659876, + "C": 0.9907028675079346, + "D": 9.521808533463627e-05 }, "sample": { "messages": [ @@ -44287,10 +44287,10 @@ ] }, "predict": { - "A": 0.0030671784188598394, - "B": 0.963678777217865, - "C": 0.0003232782182749361, - "D": 0.00025176932103931904 + "A": 0.0036004739813506603, + "B": 0.8810069561004639, + "C": 0.0003564952057786286, + "D": 0.0003564952057786286 }, "sample": { "messages": [ @@ -44332,10 +44332,10 @@ ] }, "predict": { - "A": 0.0003337286179885268, - "B": 0.0009071684326045215, - "C": 0.9948310256004333, - "D": 6.173358269734308e-05 + "A": 0.00010181441757595167, + "B": 5.119550769450143e-05, + "C": 0.9951529502868652, + "D": 2.134146234311629e-05 }, "sample": { "messages": [ @@ -44377,10 +44377,10 @@ ] }, "predict": { - "A": 6.504853081423789e-05, - "B": 0.984740674495697, - "C": 3.481797102722339e-05, - "D": 0.0007924533565528691 + "A": 4.064514359924942e-05, + "B": 0.8952687978744507, + "C": 2.315890378667973e-05, + "D": 0.00072045240085572 }, "sample": { "messages": [ @@ -44422,10 +44422,10 @@ ] }, "predict": { - "A": 0.04344900697469711, - "B": 0.5997950434684753, - "C": 0.17184415459632874, - "D": 0.15165193378925323 + "A": 0.12249653786420822, + "B": 0.3773159086704254, + "C": 0.08419056236743927, + "D": 0.3773159086704254 }, "sample": { "messages": [ @@ -44455,7 +44455,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -44467,10 +44467,10 @@ ] }, "predict": { - "A": 0.9649026393890381, - "B": 0.004468386061489582, - "C": 0.0003236887860111892, - "D": 0.0003236887860111892 + "A": 0.9385399222373962, + "B": 0.0009697910863906145, + "C": 0.00011582503066165373, + "D": 0.00017939330427907407 }, "sample": { "messages": [ @@ -44512,10 +44512,10 @@ ] }, "predict": { - "A": 0.40566328167915344, - "B": 0.1691056191921234, - "C": 0.3579965829849243, - "D": 0.002128725638613105 + "A": 0.4116973578929901, + "B": 0.02049720473587513, + "C": 0.3633216321468353, + "D": 0.0005814609467051923 }, "sample": { "messages": [ @@ -44557,10 +44557,10 @@ ] }, "predict": { - "A": 0.09002221375703812, - "B": 0.04818542301654816, - "C": 0.314208447933197, - "D": 0.5180421471595764 + "A": 0.04318961501121521, + "B": 0.015888571739196777, + "C": 0.21933495998382568, + "D": 0.6755992770195007 }, "sample": { "messages": [ @@ -44602,10 +44602,10 @@ ] }, "predict": { - "A": 0.025146719068288803, - "B": 0.002339009428396821, - "C": 0.0002793548337649554, - "D": 0.9436237215995789 + "A": 0.005115446634590626, + "B": 0.0003944608906749636, + "C": 0.0001644357544137165, + "D": 0.8602858781814575 }, "sample": { "messages": [ @@ -44647,10 +44647,10 @@ ] }, "predict": { - "A": 0.015368135645985603, - "B": 0.006406391970813274, - "C": 0.0056536211632192135, - "D": 0.9507929086685181 + "A": 0.002080610953271389, + "B": 0.001429981435649097, + "C": 0.002357641002163291, + "D": 0.9511402249336243 }, "sample": { "messages": [ @@ -44692,10 +44692,10 @@ ] }, "predict": { - "A": 0.0024253251031041145, - "B": 0.9784458875656128, - "C": 0.0002556273539084941, - "D": 0.0005411631427705288 + "A": 0.0022729800548404455, + "B": 0.9169855713844299, + "C": 0.00019861076725646853, + "D": 0.0004204590222798288 }, "sample": { "messages": [ @@ -44737,10 +44737,10 @@ ] }, "predict": { - "A": 0.6573677659034729, - "B": 0.032728411257267, - "C": 0.012040110304951668, - "D": 0.2740316390991211 + "A": 0.5312893986701965, + "B": 0.007119282148778439, + "C": 0.006687946617603302, + "D": 0.41376858949661255 }, "sample": { "messages": [ @@ -44782,10 +44782,10 @@ ] }, "predict": { - "A": 0.1265111118555069, - "B": 0.8249560594558716, - "C": 0.009164434857666492, - "D": 0.013334172777831554 + "A": 0.09884284436702728, + "B": 0.8276009559631348, + "C": 0.0033822159748524427, + "D": 0.0049210949800908566 }, "sample": { "messages": [ @@ -44827,10 +44827,10 @@ ] }, "predict": { - "A": 0.024431375786662102, - "B": 0.9167806506156921, - "C": 0.005451376549899578, - "D": 0.040280532091856 + "A": 0.030881965532898903, + "B": 0.9025033712387085, + "C": 0.0078081837855279446, + "D": 0.018730858340859413 }, "sample": { "messages": [ @@ -44872,10 +44872,10 @@ ] }, "predict": { - "A": 0.001915070810355246, - "B": 0.0013162075774744153, - "C": 0.9920312166213989, - "D": 8.414233161602169e-05 + "A": 0.0009043327299878001, + "B": 0.00016728472837712616, + "C": 0.9917212128639221, + "D": 3.5064786061411723e-05 }, "sample": { "messages": [ @@ -44917,10 +44917,10 @@ ] }, "predict": { - "A": 0.65186607837677, - "B": 0.047221019864082336, - "C": 0.0030187389347702265, - "D": 0.06870617717504501 + "A": 0.5726515650749207, + "B": 0.005614078603684902, + "C": 0.0034051104448735714, + "D": 0.019595056772232056 }, "sample": { "messages": [ @@ -44962,10 +44962,10 @@ ] }, "predict": { - "A": 0.0007759287836961448, - "B": 0.9642064571380615, - "C": 0.00011899253149749711, - "D": 0.0006847547483630478 + "A": 0.0009872756199911237, + "B": 0.8975726366043091, + "C": 8.104052540147677e-05, + "D": 0.0006374330841936171 }, "sample": { "messages": [ @@ -45007,10 +45007,10 @@ ] }, "predict": { - "A": 0.6198377013206482, - "B": 0.10771163552999496, - "C": 0.22802558541297913, - "D": 0.002533135237172246 + "A": 0.6740114092826843, + "B": 0.01796179637312889, + "C": 0.17041677236557007, + "D": 0.0005773809971287847 }, "sample": { "messages": [ @@ -45052,10 +45052,10 @@ ] }, "predict": { - "A": 0.03188832104206085, - "B": 0.9319133758544922, - "C": 0.0024589633103460073, - "D": 0.013293029740452766 + "A": 0.03988667577505112, + "B": 0.9078165292739868, + "C": 0.0012044731993228197, + "D": 0.01008493173867464 }, "sample": { "messages": [ @@ -45097,10 +45097,10 @@ ] }, "predict": { - "A": 0.0009017345146276057, - "B": 0.0010217990493401885, - "C": 0.9888719916343689, - "D": 0.00013828548253513873 + "A": 0.00025779721909202635, + "B": 0.00011439683294156566, + "C": 0.9867513179779053, + "D": 4.208423706586473e-05 }, "sample": { "messages": [ @@ -45142,10 +45142,10 @@ ] }, "predict": { - "A": 0.003350913990288973, - "B": 0.7235958576202393, - "C": 0.23491717875003815, - "D": 0.008038424886763096 + "A": 0.0036456456873565912, + "B": 0.5410618185997009, + "C": 0.37186598777770996, + "D": 0.004681101534515619 }, "sample": { "messages": [ @@ -45187,10 +45187,10 @@ ] }, "predict": { - "A": 0.003149386728182435, - "B": 0.0010224560974165797, - "C": 0.9895078539848328, - "D": 0.00013837439473718405 + "A": 0.0024380874820053577, + "B": 0.0002267774543724954, + "C": 0.9835947155952454, + "D": 4.465513120521791e-05 }, "sample": { "messages": [ @@ -45232,10 +45232,10 @@ ] }, "predict": { - "A": 0.017277956008911133, - "B": 0.0034022312611341476, - "C": 0.9433444142341614, - "D": 0.02848653309047222 + "A": 0.007482615765184164, + "B": 0.00047834767610765994, + "C": 0.9800293445587158, + "D": 0.006603385787457228 }, "sample": { "messages": [ @@ -45277,10 +45277,10 @@ ] }, "predict": { - "A": 0.007505442015826702, - "B": 0.6756182909011841, - "C": 0.001674690400250256, - "D": 0.28163960576057434 + "A": 0.005612601991742849, + "B": 0.6487285494804382, + "C": 0.0009162321221083403, + "D": 0.23865392804145813 }, "sample": { "messages": [ @@ -45322,10 +45322,10 @@ ] }, "predict": { - "A": 0.8660029172897339, - "B": 0.017973316833376884, - "C": 0.004010389093309641, - "D": 0.08055081218481064 + "A": 0.9141305088996887, + "B": 0.0021286187693476677, + "C": 0.0012910725781694055, + "D": 0.014775543473660946 }, "sample": { "messages": [ @@ -45367,10 +45367,10 @@ ] }, "predict": { - "A": 0.1281999796628952, - "B": 0.009286775253713131, - "C": 0.004120981320738792, - "D": 0.8359688520431519 + "A": 0.2578527629375458, + "B": 0.0019687300082296133, + "C": 0.0017373980954289436, + "D": 0.7009164094924927 }, "sample": { "messages": [ @@ -45412,10 +45412,10 @@ ] }, "predict": { - "A": 0.2242530733346939, - "B": 0.6095829606056213, - "C": 0.056700047105550766, - "D": 0.07280430942773819 + "A": 0.29876676201820374, + "B": 0.5581697821617126, + "C": 0.0314897857606411, + "D": 0.06666387617588043 }, "sample": { "messages": [ @@ -45457,10 +45457,10 @@ ] }, "predict": { - "A": 0.4639817476272583, - "B": 0.36134934425354004, - "C": 0.0711539089679718, - "D": 0.0806279331445694 + "A": 0.5912370681762695, + "B": 0.04853169247508049, + "C": 0.11642147600650787, + "D": 0.19194656610488892 }, "sample": { "messages": [ @@ -45502,10 +45502,10 @@ ] }, "predict": { - "A": 0.9516857266426086, - "B": 0.003432314610108733, - "C": 0.0018371857004240155, - "D": 0.0038893220480531454 + "A": 0.9138096570968628, + "B": 0.00023874056932982057, + "C": 0.00023874056932982057, + "D": 0.00036976864794269204 }, "sample": { "messages": [ @@ -45540,17 +45540,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.0097762206569314, - "B": 0.8800272345542908, - "C": 0.0722370371222496, - "D": 0.0028009340167045593 + "A": 0.004419445060193539, + "B": 0.21294079720973969, + "C": 0.7432363629341125, + "D": 0.0005618694121949375 }, "sample": { "messages": [ @@ -45580,7 +45580,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -45592,10 +45592,10 @@ ] }, "predict": { - "A": 0.9216262698173523, - "B": 0.016880173236131668, - "C": 0.002284483052790165, - "D": 0.03153631463646889 + "A": 0.9445217847824097, + "B": 0.0014200310688465834, + "C": 0.0010389168746769428, + "D": 0.010492689907550812 }, "sample": { "messages": [ @@ -45630,17 +45630,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.42339497804641724, - "B": 0.05056731775403023, - "C": 0.023886309936642647, - "D": 0.47976934909820557 + "A": 0.6919320821762085, + "B": 0.018439365550875664, + "C": 0.03040137328207493, + "D": 0.2246374487876892 }, "sample": { "messages": [ @@ -45670,7 +45670,7 @@ "prompt_len": 108, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -45682,10 +45682,10 @@ ] }, "predict": { - "A": 0.03946331888437271, - "B": 0.0186411514878273, - "C": 0.012811862863600254, - "D": 0.8981809616088867 + "A": 0.016632506623864174, + "B": 0.0028902965132147074, + "C": 0.0032751348335295916, + "D": 0.9081041216850281 }, "sample": { "messages": [ @@ -45720,17 +45720,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.33468565344810486, - "B": 0.4297448694705963, - "C": 0.08462177962064743, - "D": 0.12312397360801697 + "A": 0.45959988236427307, + "B": 0.19158972799777985, + "C": 0.10255060344934464, + "D": 0.16907736659049988 }, "sample": { "messages": [ @@ -45760,7 +45760,7 @@ "prompt_len": 84, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -45772,10 +45772,10 @@ ] }, "predict": { - "A": 0.015740398317575455, - "B": 0.0024138682056218386, - "C": 0.9738239645957947, - "D": 0.0002390053414274007 + "A": 0.009578008204698563, + "B": 0.000327741407090798, + "C": 0.976983368396759, + "D": 5.3502313676290214e-05 }, "sample": { "messages": [ @@ -45817,10 +45817,10 @@ ] }, "predict": { - "A": 0.0007874325383454561, - "B": 0.978501558303833, - "C": 0.00013683526776731014, - "D": 0.00019909412367269397 + "A": 0.0005049057072028518, + "B": 0.857581615447998, + "C": 0.00013589351146947592, + "D": 0.00030624077771790326 }, "sample": { "messages": [ @@ -45862,10 +45862,10 @@ ] }, "predict": { - "A": 0.019014470279216766, - "B": 0.9161684513092041, - "C": 0.0029159635305404663, - "D": 0.027665890753269196 + "A": 0.02692292258143425, + "B": 0.8915647864341736, + "C": 0.003878614865243435, + "D": 0.023759394884109497 }, "sample": { "messages": [ @@ -45907,10 +45907,10 @@ ] }, "predict": { - "A": 0.9515626430511475, - "B": 0.010570907033979893, - "C": 0.003431870834901929, - "D": 0.006411578506231308 + "A": 0.8999912738800049, + "B": 0.0015332431066781282, + "C": 0.0005298745818436146, + "D": 0.0019687230233103037 }, "sample": { "messages": [ @@ -45952,10 +45952,10 @@ ] }, "predict": { - "A": 0.896499752998352, - "B": 0.016419965773820877, - "C": 0.03067653253674507, - "D": 0.00604056753218174 + "A": 0.8044078350067139, + "B": 0.0022594162728637457, + "C": 0.008936164900660515, + "D": 0.002560253953561187 }, "sample": { "messages": [ @@ -45997,10 +45997,10 @@ ] }, "predict": { - "A": 0.003483326407149434, - "B": 0.001130870427004993, - "C": 0.00041602400597184896, - "D": 0.9658298492431641 + "A": 0.0014274884015321732, + "B": 0.0002189124352298677, + "C": 0.00020564919395837933, + "D": 0.9494820237159729 }, "sample": { "messages": [ @@ -46042,10 +46042,10 @@ ] }, "predict": { - "A": 0.26504209637641907, - "B": 0.07593583315610886, - "C": 0.0029443521052598953, - "D": 0.6358029246330261 + "A": 0.35254815220832825, + "B": 0.017552340403199196, + "C": 0.001353491097688675, + "D": 0.5812536478042603 }, "sample": { "messages": [ @@ -46087,10 +46087,10 @@ ] }, "predict": { - "A": 0.6877145767211914, - "B": 0.22326822578907013, - "C": 0.0036087948828935623, - "D": 0.049817878752946854 + "A": 0.8982782959938049, + "B": 0.016452541574835777, + "C": 0.0005288660759106278, + "D": 0.007771630305796862 }, "sample": { "messages": [ @@ -46132,10 +46132,10 @@ ] }, "predict": { - "A": 0.014190619811415672, - "B": 0.09253446757793427, - "C": 0.002794299041852355, - "D": 0.8779425621032715 + "A": 0.009042885154485703, + "B": 0.02458110824227333, + "C": 0.006215077359229326, + "D": 0.9223992824554443 }, "sample": { "messages": [ @@ -46177,10 +46177,10 @@ ] }, "predict": { - "A": 0.007479383610188961, - "B": 0.0031178712379187346, - "C": 0.9796059727668762, - "D": 0.0016688762698322535 + "A": 0.00953083299100399, + "B": 0.000537692743819207, + "C": 0.9721713662147522, + "D": 0.0006092857220210135 }, "sample": { "messages": [ @@ -46222,10 +46222,10 @@ ] }, "predict": { - "A": 0.0322904996573925, - "B": 0.0056112478487193584, - "C": 0.9436668157577515, - "D": 0.0056112478487193584 + "A": 0.046047642827034, + "B": 0.0015756638022139668, + "C": 0.9248916506767273, + "D": 0.0013905182713642716 }, "sample": { "messages": [ @@ -46267,10 +46267,10 @@ ] }, "predict": { - "A": 0.7571605443954468, - "B": 0.042716074734926224, - "C": 0.14909373223781586, - "D": 0.022864267230033875 + "A": 0.8962591290473938, + "B": 0.0011891390895470977, + "C": 0.006843022536486387, + "D": 0.0007212493219412863 }, "sample": { "messages": [ @@ -46312,10 +46312,10 @@ ] }, "predict": { - "A": 0.0024281649384647608, - "B": 0.0011469838209450245, - "C": 0.00025592665770091116, - "D": 0.9795915484428406 + "A": 0.0007489615818485618, + "B": 0.00021458107221405953, + "C": 4.7879511839710176e-05, + "D": 0.9306957125663757 }, "sample": { "messages": [ @@ -46357,10 +46357,10 @@ ] }, "predict": { - "A": 0.01861531473696232, - "B": 0.0032348569948226213, - "C": 0.25697654485702515, - "D": 0.6985345482826233 + "A": 0.002432098612189293, + "B": 0.0007895868038758636, + "C": 0.09126382321119308, + "D": 0.8658870458602905 }, "sample": { "messages": [ @@ -46402,10 +46402,10 @@ ] }, "predict": { - "A": 0.0321827307343483, - "B": 0.013415757566690445, - "C": 0.940517246723175, - "D": 0.0007110091391950846 + "A": 0.03657689690589905, + "B": 0.0015097089344635606, + "C": 0.9433305859565735, + "D": 0.0001239244593307376 }, "sample": { "messages": [ @@ -46447,10 +46447,10 @@ ] }, "predict": { - "A": 0.9373570680618286, - "B": 0.005573728587478399, - "C": 0.0005184375331737101, - "D": 0.0012436669785529375 + "A": 0.9139231443405151, + "B": 0.0004748513165395707, + "C": 0.00010595365893095732, + "D": 0.00015416165115311742 }, "sample": { "messages": [ @@ -46492,10 +46492,10 @@ ] }, "predict": { - "A": 0.000612924515735358, - "B": 0.00017560581909492612, - "C": 0.0001549715962028131, - "D": 0.9779773950576782 + "A": 5.971847349428572e-05, + "B": 3.4026561479549855e-05, + "C": 3.622108488343656e-05, + "D": 0.9623576402664185 }, "sample": { "messages": [ @@ -46537,10 +46537,10 @@ ] }, "predict": { - "A": 0.9471791386604309, - "B": 0.002660431433469057, - "C": 0.0003600502386689186, - "D": 0.0014240262098610401 + "A": 0.8659180402755737, + "B": 0.00037298773531802, + "C": 0.00015548442024737597, + "D": 0.00037298773531802 }, "sample": { "messages": [ @@ -46582,10 +46582,10 @@ ] }, "predict": { - "A": 0.03987078368663788, - "B": 0.02418285235762596, - "C": 0.9074548482894897, - "D": 0.008896375074982643 + "A": 0.2073337286710739, + "B": 0.006260935682803392, + "C": 0.7236657738685608, + "D": 0.0058816056698560715 }, "sample": { "messages": [ @@ -46627,10 +46627,10 @@ ] }, "predict": { - "A": 0.9590712785720825, - "B": 0.00209795986302197, - "C": 0.010654320009052753, - "D": 5.590873843175359e-05 + "A": 0.7819416522979736, + "B": 0.0009155594743788242, + "C": 0.0183895044028759, + "D": 2.5972422008635476e-05 }, "sample": { "messages": [ @@ -46672,10 +46672,10 @@ ] }, "predict": { - "A": 0.0021445751190185547, - "B": 0.0014739434700459242, - "C": 0.0016701968852430582, - "D": 0.980381190776825 + "A": 0.0006750370957888663, + "B": 0.00023328659881372005, + "C": 0.0004938676720485091, + "D": 0.9505230188369751 }, "sample": { "messages": [ @@ -46717,10 +46717,10 @@ ] }, "predict": { - "A": 0.5450291037559509, - "B": 0.2917330861091614, - "C": 0.007774422410875559, - "D": 0.13780494034290314 + "A": 0.7918993234634399, + "B": 0.13761146366596222, + "C": 0.00996854156255722, + "D": 0.021103402599692345 }, "sample": { "messages": [ @@ -46762,10 +46762,10 @@ ] }, "predict": { - "A": 0.02021894045174122, - "B": 0.7587101459503174, - "C": 0.16929112374782562, - "D": 0.022911060601472855 + "A": 0.029466940090060234, + "B": 0.7599624395370483, + "C": 0.09076456725597382, + "D": 0.009566514752805233 }, "sample": { "messages": [ @@ -46800,17 +46800,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.2799280285835266, - "B": 0.6715124249458313, - "C": 0.02297789230942726, - "D": 0.005127060692757368 + "A": 0.5840510725975037, + "B": 0.275886207818985, + "C": 0.04230852052569389, + "D": 0.013735565356910229 }, "sample": { "messages": [ @@ -46840,7 +46840,7 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -46852,10 +46852,10 @@ ] }, "predict": { - "A": 0.9376412034034729, - "B": 0.004342140629887581, - "C": 0.003381662303581834, - "D": 0.0004038819461129606 + "A": 0.8493525981903076, + "B": 0.0009342365083284676, + "C": 0.0012769500026479363, + "D": 0.0002362119994359091 }, "sample": { "messages": [ @@ -46897,10 +46897,10 @@ ] }, "predict": { - "A": 0.0156561192125082, - "B": 0.005759563762694597, - "C": 0.9686097502708435, - "D": 0.0010008621029555798 + "A": 0.005853093694895506, + "B": 0.0006169115076772869, + "C": 0.9843389987945557, + "D": 0.00012147708184784278 }, "sample": { "messages": [ @@ -46942,10 +46942,10 @@ ] }, "predict": { - "A": 0.0034700052347034216, - "B": 0.9621362090110779, - "C": 0.004455575253814459, - "D": 0.006482822820544243 + "A": 0.0029539174865931273, + "B": 0.9280932545661926, + "C": 0.001156770740635693, + "D": 0.0031444288324564695 }, "sample": { "messages": [ @@ -46987,10 +46987,10 @@ ] }, "predict": { - "A": 0.06332775205373764, - "B": 0.08131444454193115, - "C": 0.05588654801249504, - "D": 0.7714899778366089 + "A": 0.10217276960611343, + "B": 0.05468914285302162, + "C": 0.05468914285302162, + "D": 0.754960298538208 }, "sample": { "messages": [ @@ -47032,10 +47032,10 @@ ] }, "predict": { - "A": 0.015562133863568306, - "B": 0.022642768919467926, - "C": 0.08955389261245728, - "D": 0.8496636748313904 + "A": 0.006821826566010714, + "B": 0.0022147230338305235, + "C": 0.050406862050294876, + "D": 0.8934829831123352 }, "sample": { "messages": [ @@ -47077,10 +47077,10 @@ ] }, "predict": { - "A": 0.0018954076804220676, - "B": 0.981845498085022, - "C": 0.00019977449846919626, - "D": 0.00025651551550254226 + "A": 0.0026081495452672243, + "B": 0.928565502166748, + "C": 0.00025824178010225296, + "D": 0.0007472473080269992 }, "sample": { "messages": [ @@ -47115,17 +47115,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.02661627158522606, - "B": 0.7778414487838745, - "C": 0.17355990409851074, - "D": 0.0021847968455404043 + "A": 0.015963658690452576, + "B": 0.41170817613601685, + "C": 0.528643786907196, + "D": 0.0022997830528765917 }, "sample": { "messages": [ @@ -47155,7 +47155,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -47167,10 +47167,10 @@ ] }, "predict": { - "A": 0.012089131399989128, - "B": 0.01758958213031292, - "C": 0.9603586792945862, - "D": 0.0006407005130313337 + "A": 0.013880642130970955, + "B": 0.0018785407301038504, + "C": 0.973108172416687, + "D": 0.0005056023946963251 }, "sample": { "messages": [ @@ -47212,10 +47212,10 @@ ] }, "predict": { - "A": 0.40731385350227356, - "B": 0.4615470767021179, - "C": 0.0018862374126911163, - "D": 0.0908840000629425 + "A": 0.34049341082572937, + "B": 0.43720218539237976, + "C": 0.0037825400941073895, + "D": 0.14193876087665558 }, "sample": { "messages": [ @@ -47250,17 +47250,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.09162387996912003, - "B": 0.8693031668663025, - "C": 0.01094291266053915, - "D": 0.0015764759154990315 + "A": 0.8565446138381958, + "B": 0.05475706234574318, + "C": 0.013844753615558147, + "D": 0.0006475277477875352 }, "sample": { "messages": [ @@ -47290,7 +47290,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -47302,10 +47302,10 @@ ] }, "predict": { - "A": 0.03622095286846161, - "B": 0.9341506958007812, - "C": 0.003817660501226783, - "D": 0.0015914378454908729 + "A": 0.03951166197657585, + "B": 0.8992812633514404, + "C": 0.0026888023130595684, + "D": 0.0034524905495345592 }, "sample": { "messages": [ @@ -47347,10 +47347,10 @@ ] }, "predict": { - "A": 0.0251067616045475, - "B": 0.9421243071556091, - "C": 0.004943814594298601, - "D": 0.0023352927528321743 + "A": 0.03068062849342823, + "B": 0.8966193795204163, + "C": 0.0053314934484660625, + "D": 0.002087842905893922 }, "sample": { "messages": [ @@ -47392,10 +47392,10 @@ ] }, "predict": { - "A": 0.2180049866437912, - "B": 0.0010095633333548903, - "C": 0.00018675044702831656, - "D": 0.7609121203422546 + "A": 0.07394568622112274, + "B": 0.0001427487441105768, + "C": 0.00011117283429484814, + "D": 0.9008428454399109 }, "sample": { "messages": [ @@ -47437,10 +47437,10 @@ ] }, "predict": { - "A": 0.9547809362411499, - "B": 0.007289842236787081, - "C": 0.0034434779081493616, - "D": 0.00046602406655438244 + "A": 0.8785117268562317, + "B": 0.0010286314645782113, + "C": 0.000907764071598649, + "D": 0.0001228525215992704 }, "sample": { "messages": [ @@ -47482,10 +47482,10 @@ ] }, "predict": { - "A": 0.0649067685008049, - "B": 0.8960102796554565, - "C": 0.0036617894656956196, - "D": 0.014482657425105572 + "A": 0.08038710057735443, + "B": 0.8642428517341614, + "C": 0.0017759824404492974, + "D": 0.009600871242582798 }, "sample": { "messages": [ @@ -47520,17 +47520,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.08703158795833588, - "B": 0.7287065386772156, - "C": 0.007143987808376551, - "D": 0.14349082112312317 + "A": 0.35623055696487427, + "B": 0.31437236070632935, + "C": 0.0083777392283082, + "D": 0.2774326205253601 }, "sample": { "messages": [ @@ -47560,22 +47560,22 @@ "prompt_len": 89, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.08466822654008865, - "B": 0.027487752959132195, - "C": 0.4872320592403412, - "D": 0.37945669889450073 + "A": 0.02763265371322632, + "B": 0.00291246036067605, + "C": 0.38145703077316284, + "D": 0.5550166964530945 }, "sample": { "messages": [ @@ -47605,7 +47605,7 @@ "prompt_len": 85, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -47617,10 +47617,10 @@ ] }, "predict": { - "A": 0.9689573645591736, - "B": 0.0034946061205118895, - "C": 0.00047294353134930134, - "D": 0.0002868547453545034 + "A": 0.8924700617790222, + "B": 0.0004092180752195418, + "C": 0.0001414219877915457, + "D": 4.887413160759024e-05 }, "sample": { "messages": [ @@ -47662,10 +47662,10 @@ ] }, "predict": { - "A": 0.01541314646601677, - "B": 0.9535776376724243, - "C": 0.0023636827245354652, - "D": 0.005003916099667549 + "A": 0.033288031816482544, + "B": 0.8585096597671509, + "C": 0.00479559600353241, + "D": 0.017817799001932144 }, "sample": { "messages": [ @@ -47707,10 +47707,10 @@ ] }, "predict": { - "A": 2.7378517188481055e-05, - "B": 0.0006231321021914482, - "C": 0.994264543056488, - "D": 0.00022923749929759651 + "A": 8.889404853107408e-06, + "B": 5.796619734610431e-05, + "C": 0.9943654537200928, + "D": 2.1324573026504368e-05 }, "sample": { "messages": [ @@ -47745,17 +47745,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "D" ] }, "predict": { - "A": 0.2844873368740082, - "B": 0.029984746128320694, - "C": 0.36528897285461426, - "D": 0.2844873368740082 + "A": 0.35283952951431274, + "B": 0.010009285993874073, + "C": 0.06947822123765945, + "D": 0.45305490493774414 }, "sample": { "messages": [ @@ -47785,7 +47785,7 @@ "prompt_len": 81, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -47797,10 +47797,10 @@ ] }, "predict": { - "A": 0.9183299541473389, - "B": 0.0403486043214798, - "C": 0.0013806532369926572, - "D": 0.0022763125598430634 + "A": 0.8503229022026062, + "B": 0.0009956257417798042, + "C": 0.0002679690660443157, + "D": 0.00018417224055156112 }, "sample": { "messages": [ @@ -47842,10 +47842,10 @@ ] }, "predict": { - "A": 0.00079326395643875, - "B": 0.004028526600450277, - "C": 0.9857479929924011, - "D": 0.00079326395643875 + "A": 0.0005134252714924514, + "B": 0.000795208674389869, + "C": 0.9881645441055298, + "D": 0.00037562998477369547 }, "sample": { "messages": [ @@ -47887,10 +47887,10 @@ ] }, "predict": { - "A": 0.2788456976413727, - "B": 0.5209523439407349, - "C": 0.14925535023212433, - "D": 0.017826007679104805 + "A": 0.28645381331443787, + "B": 0.6064226627349854, + "C": 0.03019200637936592, + "D": 0.005945159122347832 }, "sample": { "messages": [ @@ -47932,10 +47932,10 @@ ] }, "predict": { - "A": 0.6669629216194153, - "B": 0.054747648537158966, - "C": 0.13133276998996735, - "D": 0.11590076982975006 + "A": 0.769661545753479, + "B": 0.010978631675243378, + "C": 0.11803141981363297, + "D": 0.026336368173360825 }, "sample": { "messages": [ @@ -47977,10 +47977,10 @@ ] }, "predict": { - "A": 0.0009863055311143398, - "B": 0.9545221328735352, - "C": 0.0008704115753062069, - "D": 0.0039009139873087406 + "A": 0.0003785578301176429, + "B": 0.9355303645133972, + "C": 0.00022960695787332952, + "D": 0.001805998501367867 }, "sample": { "messages": [ @@ -48022,10 +48022,10 @@ ] }, "predict": { - "A": 0.8748838901519775, - "B": 0.08137687295675278, - "C": 0.001490469672717154, - "D": 0.0031553239095956087 + "A": 0.9092736840248108, + "B": 0.0047714305110275745, + "C": 0.0003679331857711077, + "D": 0.00028654668130911887 }, "sample": { "messages": [ @@ -48067,10 +48067,10 @@ ] }, "predict": { - "A": 0.0011578969424590468, - "B": 0.0011578969424590468, - "C": 0.9889121055603027, - "D": 0.0024512680247426033 + "A": 0.00040157584589906037, + "B": 0.00014773149450775236, + "C": 0.9924147725105286, + "D": 0.0005488891038112342 }, "sample": { "messages": [ @@ -48112,10 +48112,10 @@ ] }, "predict": { - "A": 0.001640137517824769, - "B": 0.003064180025830865, - "C": 0.00046990724513307214, - "D": 0.9627367258071899 + "A": 0.0003470281953923404, + "B": 0.0002702658239286393, + "C": 0.00013589818263426423, + "D": 0.9129222631454468 }, "sample": { "messages": [ @@ -48157,10 +48157,10 @@ ] }, "predict": { - "A": 0.07220251858234406, - "B": 0.8796067237854004, - "C": 0.00407338747754693, - "D": 0.016110559925436974 + "A": 0.24178385734558105, + "B": 0.657236635684967, + "C": 0.004714034032076597, + "D": 0.03272188454866409 }, "sample": { "messages": [ @@ -48202,10 +48202,10 @@ ] }, "predict": { - "A": 0.9531792998313904, - "B": 0.006422471720725298, - "C": 0.0008691870025359094, - "D": 0.001116058207117021 + "A": 0.8794493079185486, + "B": 0.0006648431881330907, + "C": 0.00017894011398311704, + "D": 0.00022976363834459335 }, "sample": { "messages": [ @@ -48247,10 +48247,10 @@ ] }, "predict": { - "A": 0.25852108001708984, - "B": 0.008846104145050049, - "C": 0.0017419012729078531, - "D": 0.7027330994606018 + "A": 0.19513264298439026, + "B": 0.0008488944149576128, + "C": 0.000514880521222949, + "D": 0.7717645764350891 }, "sample": { "messages": [ @@ -48292,10 +48292,10 @@ ] }, "predict": { - "A": 5.7418506912654266e-05, - "B": 0.0003744157147593796, - "C": 3.0733914172742516e-05, - "D": 0.984970211982727 + "A": 2.144811332982499e-05, + "B": 0.00015848129987716675, + "C": 9.51754191191867e-06, + "D": 0.9395315051078796 }, "sample": { "messages": [ @@ -48333,14 +48333,14 @@ "acc": false, "f1_macro": [ "B", - "C" + "A" ] }, "predict": { - "A": 0.37706440687179565, - "B": 0.12241489440202713, - "C": 0.42726993560791016, - "D": 0.03974229469895363 + "A": 0.6783570051193237, + "B": 0.01595342718064785, + "C": 0.2495536208152771, + "D": 0.0066503784619271755 }, "sample": { "messages": [ @@ -48370,7 +48370,7 @@ "prompt_len": 84, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -48382,10 +48382,10 @@ ] }, "predict": { - "A": 0.7256073355674744, - "B": 0.008060770109295845, - "C": 0.20788998901844025, - "D": 0.0005839202203787863 + "A": 0.708358645439148, + "B": 0.0017558455001562834, + "C": 0.20294815301895142, + "D": 0.0002529534976929426 }, "sample": { "messages": [ @@ -48427,10 +48427,10 @@ ] }, "predict": { - "A": 0.004031549207866192, - "B": 0.001680599874816835, - "C": 0.9864875674247742, - "D": 0.0002920444530900568 + "A": 0.0045588258653879166, + "B": 0.00031023198971524835, + "C": 0.9844322204589844, + "D": 5.7387147535337135e-05 }, "sample": { "messages": [ @@ -48468,14 +48468,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.21926485002040863, - "B": 0.2484596222639084, - "C": 0.13299085199832916, - "D": 0.1935005486011505 + "A": 0.3750138282775879, + "B": 0.13795989751815796, + "C": 0.22745738923549652, + "D": 0.13795989751815796 }, "sample": { "messages": [ @@ -48505,7 +48505,7 @@ "prompt_len": 65, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -48517,10 +48517,10 @@ ] }, "predict": { - "A": 0.5733725428581238, - "B": 0.06847958266735077, - "C": 0.3069041967391968, - "D": 0.0005924644647166133 + "A": 0.6407325863838196, + "B": 0.009139559231698513, + "C": 0.18357296288013458, + "D": 0.00016739685088396072 }, "sample": { "messages": [ @@ -48562,10 +48562,10 @@ ] }, "predict": { - "A": 0.0016714995726943016, - "B": 0.0011488035088405013, - "C": 0.00037296192022040486, - "D": 0.9811457991600037 + "A": 0.0007229549228213727, + "B": 0.00015153958520386368, + "C": 0.00015153958520386368, + "D": 0.9563190340995789 }, "sample": { "messages": [ @@ -48607,10 +48607,10 @@ ] }, "predict": { - "A": 0.0028415892738848925, - "B": 0.2898499667644501, - "C": 0.0017235110281035304, - "D": 0.6953139305114746 + "A": 0.0018270385917276144, + "B": 0.44706159830093384, + "C": 0.0014228991931304336, + "D": 0.5065871477127075 }, "sample": { "messages": [ @@ -48652,10 +48652,10 @@ ] }, "predict": { - "A": 0.009545521810650826, - "B": 0.003979165572673082, - "C": 0.00129184580873698, - "D": 0.9736696481704712 + "A": 0.0049914345145225525, + "B": 0.0004361466853879392, + "C": 0.00038489807047881186, + "D": 0.9511989951133728 }, "sample": { "messages": [ @@ -48697,10 +48697,10 @@ ] }, "predict": { - "A": 0.9524503946304321, - "B": 0.005663476884365082, - "C": 0.0009841647697612643, - "D": 0.002360888756811619 + "A": 0.9026392102241516, + "B": 0.0009326949366368353, + "C": 0.0002672215923666954, + "D": 0.00053143355762586 }, "sample": { "messages": [ @@ -48742,10 +48742,10 @@ ] }, "predict": { - "A": 0.005125820636749268, - "B": 0.9768085479736328, - "C": 0.002136759925633669, - "D": 0.00047677563270553946 + "A": 0.00257248617708683, + "B": 0.9158684611320496, + "C": 0.0017680421005934477, + "D": 0.00037060174508951604 }, "sample": { "messages": [ @@ -48787,10 +48787,10 @@ ] }, "predict": { - "A": 0.943061113357544, - "B": 0.03656644746661186, - "C": 0.0009744628332555294, - "D": 0.001606617821380496 + "A": 0.833646833896637, + "B": 0.06038917601108551, + "C": 0.0011060662800446153, + "D": 0.0007601875113323331 }, "sample": { "messages": [ @@ -48832,10 +48832,10 @@ ] }, "predict": { - "A": 0.23912979662418365, - "B": 0.0026564921718090773, - "C": 0.0018257786286994815, - "D": 0.7365716099739075 + "A": 0.022154564037919044, + "B": 0.00035809515975415707, + "C": 0.0003363992436788976, + "D": 0.942035973072052 }, "sample": { "messages": [ @@ -48877,10 +48877,10 @@ ] }, "predict": { - "A": 0.012228431180119514, - "B": 0.0015546706272289157, - "C": 0.00017442919488530606, - "D": 0.9714245796203613 + "A": 0.013755284249782562, + "B": 0.00023667275672778487, + "C": 3.629493949119933e-05, + "D": 0.9643198847770691 }, "sample": { "messages": [ @@ -48922,10 +48922,10 @@ ] }, "predict": { - "A": 0.046546149998903275, - "B": 0.9349044561386108, - "C": 0.00040270312456414104, - "D": 0.003371792146936059 + "A": 0.20886944234371185, + "B": 0.7290259599685669, + "C": 0.0008018845110200346, + "D": 0.003376060165464878 }, "sample": { "messages": [ @@ -48967,10 +48967,10 @@ ] }, "predict": { - "A": 0.0027014072984457016, - "B": 0.0073431855998933315, - "C": 0.008320920169353485, - "D": 0.9617675542831421 + "A": 0.0009726338321343064, + "B": 0.0015064447652548552, + "C": 0.002059065969660878, + "D": 0.9412910342216492 }, "sample": { "messages": [ @@ -49012,10 +49012,10 @@ ] }, "predict": { - "A": 0.001465466571971774, - "B": 0.006567765958607197, - "C": 0.00047576738870702684, - "D": 0.9747428894042969 + "A": 0.0007761466549709439, + "B": 0.0007761466549709439, + "C": 9.269749716622755e-05, + "D": 0.9644771814346313 }, "sample": { "messages": [ @@ -49057,10 +49057,10 @@ ] }, "predict": { - "A": 0.8595454096794128, - "B": 0.09059541672468185, - "C": 0.002735744696110487, - "D": 0.0039804852567613125 + "A": 0.8108217120170593, + "B": 0.08545997738838196, + "C": 0.00746740959584713, + "D": 0.002424312988296151 }, "sample": { "messages": [ @@ -49102,10 +49102,10 @@ ] }, "predict": { - "A": 0.09169259667396545, - "B": 0.01593378558754921, - "C": 0.8699551820755005, - "D": 0.0007932964363135397 + "A": 0.09182393550872803, + "B": 0.003560400567948818, + "C": 0.8712012767791748, + "D": 0.00020086404401808977 }, "sample": { "messages": [ @@ -49140,17 +49140,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.30457040667533875, - "B": 0.6447755098342896, - "C": 0.0008554754895158112, - "D": 0.028329459950327873 + "A": 0.7658472061157227, + "B": 0.1330842822790146, + "C": 0.0003738064260687679, + "D": 0.00516023812815547 }, "sample": { "messages": [ @@ -49180,7 +49180,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -49192,10 +49192,10 @@ ] }, "predict": { - "A": 0.0005340754869394004, - "B": 0.9656311869621277, - "C": 0.0018641067435964942, - "D": 0.0008805416873656213 + "A": 0.0006642324733547866, + "B": 0.8786414861679077, + "C": 0.002177934627979994, + "D": 0.0008528913604095578 }, "sample": { "messages": [ @@ -49237,10 +49237,10 @@ ] }, "predict": { - "A": 0.0161834005266428, - "B": 0.8835837244987488, - "C": 0.07252897322177887, - "D": 0.005953540559858084 + "A": 0.01531039085239172, + "B": 0.7376959919929504, + "C": 0.18651875853538513, + "D": 0.001946497824974358 }, "sample": { "messages": [ @@ -49282,10 +49282,10 @@ ] }, "predict": { - "A": 0.0014656628482043743, - "B": 0.9748733639717102, - "C": 7.297105912584811e-05, - "D": 5.0152226322097704e-05 + "A": 0.004273680038750172, + "B": 0.9228578209877014, + "C": 5.3797688451595604e-05, + "D": 3.6974572140024975e-05 }, "sample": { "messages": [ @@ -49327,10 +49327,10 @@ ] }, "predict": { - "A": 0.008288964629173279, - "B": 0.9580740332603455, - "C": 0.013666192069649696, - "D": 0.009392628446221352 + "A": 0.023545652627944946, + "B": 0.8835440874099731, + "C": 0.018337372690439224, + "D": 0.043989069759845734 }, "sample": { "messages": [ @@ -49372,10 +49372,10 @@ ] }, "predict": { - "A": 0.1748504340648651, - "B": 0.6915468573570251, - "C": 0.026814181357622147, - "D": 0.05676562339067459 + "A": 0.20587699115276337, + "B": 0.5596316456794739, + "C": 0.0357760526239872, + "D": 0.07573790848255157 }, "sample": { "messages": [ @@ -49417,10 +49417,10 @@ ] }, "predict": { - "A": 0.01550796814262867, - "B": 0.003921028226613998, - "C": 0.0004985028062947094, - "D": 0.959443986415863 + "A": 0.015341262333095074, + "B": 0.0011829911964014173, + "C": 0.0002329447743250057, + "D": 0.9491302967071533 }, "sample": { "messages": [ @@ -49462,10 +49462,10 @@ ] }, "predict": { - "A": 0.019638191908597946, - "B": 0.00637558801099658, - "C": 0.946221113204956, - "D": 0.00637558801099658 + "A": 0.00512317568063736, + "B": 0.001295341644436121, + "C": 0.9763044714927673, + "D": 0.0013788840733468533 }, "sample": { "messages": [ @@ -49507,10 +49507,10 @@ ] }, "predict": { - "A": 0.0024252990260720253, - "B": 0.0018888246268033981, - "C": 0.9784353971481323, - "D": 0.006592645775526762 + "A": 0.0002570060605648905, + "B": 0.00021306550479494035, + "C": 0.9837230443954468, + "D": 0.0007916344329714775 }, "sample": { "messages": [ @@ -49552,10 +49552,10 @@ ] }, "predict": { - "A": 0.002440662821754813, - "B": 0.0021538774017244577, - "C": 0.9846336245536804, - "D": 6.50414585834369e-05 + "A": 0.0007936631445772946, + "B": 0.00021361152175813913, + "C": 0.9862440228462219, + "D": 1.547396823298186e-05 }, "sample": { "messages": [ @@ -49597,10 +49597,10 @@ ] }, "predict": { - "A": 0.0012856594985350966, - "B": 0.9690069556236267, - "C": 0.0005359426140785217, - "D": 0.013822141103446484 + "A": 0.0013504638336598873, + "B": 0.8982497453689575, + "C": 0.0004968079156242311, + "D": 0.030736414715647697 }, "sample": { "messages": [ @@ -49642,10 +49642,10 @@ ] }, "predict": { - "A": 0.11403631418943405, - "B": 0.8426207304000854, - "C": 0.0016266406746581197, - "D": 0.025444941595196724 + "A": 0.11049576848745346, + "B": 0.8164594173431396, + "C": 0.0016777896089479327, + "D": 0.02793770469725132 }, "sample": { "messages": [ @@ -49687,10 +49687,10 @@ ] }, "predict": { - "A": 0.14257696270942688, - "B": 0.8204737901687622, - "C": 0.006264394614845514, - "D": 0.002033751457929611 + "A": 0.13743481040000916, + "B": 0.7908827662467957, + "C": 0.0087859146296978, + "D": 0.0028523686341941357 }, "sample": { "messages": [ @@ -49732,10 +49732,10 @@ ] }, "predict": { - "A": 0.00020208285422995687, - "B": 0.00020208285422995687, - "C": 0.9931904673576355, - "D": 0.0002940288104582578 + "A": 0.00011419597285566851, + "B": 7.373046537395567e-05, + "C": 0.985018789768219, + "D": 6.506689533125609e-05 }, "sample": { "messages": [ @@ -49777,10 +49777,10 @@ ] }, "predict": { - "A": 0.9729887843132019, - "B": 0.003976382780820131, - "C": 0.00047491121222265065, - "D": 0.0011392526794224977 + "A": 0.9308329820632935, + "B": 0.0007036880124360323, + "C": 0.00021461272262968123, + "D": 0.0004268083721399307 }, "sample": { "messages": [ @@ -49822,10 +49822,10 @@ ] }, "predict": { - "A": 0.012282266281545162, - "B": 0.9757012128829956, - "C": 0.0021343377884477377, - "D": 0.0006929180235601962 + "A": 0.00944623164832592, + "B": 0.9635417461395264, + "C": 0.0018600731855258346, + "D": 0.0006428241031244397 }, "sample": { "messages": [ @@ -49867,10 +49867,10 @@ ] }, "predict": { - "A": 0.700942873954773, - "B": 0.2275628298521042, - "C": 0.009998412802815437, - "D": 0.004722916055470705 + "A": 0.8331286311149597, + "B": 0.013466269709169865, + "C": 0.004953963682055473, + "D": 0.0011053787311539054 }, "sample": { "messages": [ @@ -49912,10 +49912,10 @@ ] }, "predict": { - "A": 0.022955317050218582, - "B": 0.19220250844955444, - "C": 0.0009474791586399078, - "D": 0.7601757049560547 + "A": 0.011907949112355709, + "B": 0.12802252173423767, + "C": 0.000461721298051998, + "D": 0.8348116874694824 }, "sample": { "messages": [ @@ -49957,10 +49957,10 @@ ] }, "predict": { - "A": 0.004019354935735464, - "B": 0.0035470682196319103, - "C": 0.9835036993026733, - "D": 0.000791457889135927 + "A": 0.0024403641000390053, + "B": 0.000699175987392664, + "C": 0.9845131039619446, + "D": 0.00021323662076611072 }, "sample": { "messages": [ @@ -50002,10 +50002,10 @@ ] }, "predict": { - "A": 0.0012971078976988792, - "B": 0.9776356816291809, - "C": 0.0008914883946999907, - "D": 0.0006942918407730758 + "A": 0.0018006000900641084, + "B": 0.932733952999115, + "C": 0.0006222707452252507, + "D": 0.0005845692357979715 }, "sample": { "messages": [ @@ -50047,10 +50047,10 @@ ] }, "predict": { - "A": 0.7387480139732361, - "B": 0.23983636498451233, - "C": 0.0008649849914945662, - "D": 0.0008125781896524131 + "A": 0.9382789731025696, + "B": 0.028333570808172226, + "C": 0.0001312104577664286, + "D": 0.00012326081923674792 }, "sample": { "messages": [ @@ -50092,10 +50092,10 @@ ] }, "predict": { - "A": 0.0021292620804160833, - "B": 0.008421394973993301, - "C": 0.9733808636665344, - "D": 0.0016582710668444633 + "A": 0.0020056464709341526, + "B": 0.0018841305281966925, + "C": 0.9760037660598755, + "D": 0.0006116876029409468 }, "sample": { "messages": [ @@ -50137,10 +50137,10 @@ ] }, "predict": { - "A": 0.8983308672904968, - "B": 0.03946990147233009, - "C": 0.01645350456237793, - "D": 0.0015304143307730556 + "A": 0.8738388419151306, + "B": 0.005887879524379969, + "C": 0.0007032069261185825, + "D": 0.00024302181554958224 }, "sample": { "messages": [ @@ -50175,17 +50175,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.00590743450447917, - "B": 0.8767409920692444, - "C": 0.049462344497442245, - "D": 0.014171198010444641 + "A": 0.010071756318211555, + "B": 0.03515388071537018, + "C": 0.9066305160522461, + "D": 0.00784389115869999 }, "sample": { "messages": [ @@ -50215,7 +50215,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -50227,10 +50227,10 @@ ] }, "predict": { - "A": 0.0001579713134560734, - "B": 0.00020283920457586646, - "C": 0.9969077706336975, - "D": 0.0001579713134560734 + "A": 5.439412416308187e-05, + "B": 3.5119486710755154e-05, + "C": 0.9932683110237122, + "D": 3.979558823630214e-05 }, "sample": { "messages": [ @@ -50272,10 +50272,10 @@ ] }, "predict": { - "A": 0.9633802771568298, - "B": 0.005055355839431286, - "C": 0.0005328305996954441, - "D": 0.0006037761340849102 + "A": 0.8793239593505859, + "B": 0.0009086033678613603, + "C": 0.00017891460447572172, + "D": 0.00011551596253411844 }, "sample": { "messages": [ @@ -50310,17 +50310,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.2866857945919037, - "B": 0.6877234578132629, - "C": 0.0031847916543483734, - "D": 0.002480318071320653 + "A": 0.5842610001564026, + "B": 0.31273239850997925, + "C": 0.003263655351474881, + "D": 0.0027056655380874872 }, "sample": { "messages": [ @@ -50350,7 +50350,7 @@ "prompt_len": 72, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -50362,10 +50362,10 @@ ] }, "predict": { - "A": 0.9684397578239441, - "B": 0.0014559902483597398, - "C": 3.424160968279466e-05, - "D": 6.397174729499966e-05 + "A": 0.851603627204895, + "B": 0.00015291411546058953, + "C": 1.5140549294301309e-05, + "D": 1.1791470569733065e-05 }, "sample": { "messages": [ @@ -50407,10 +50407,10 @@ ] }, "predict": { - "A": 0.0031387447379529476, - "B": 0.0021572255063802004, - "C": 0.9861642718315125, - "D": 0.0013084234669804573 + "A": 0.0007013180293142796, + "B": 0.00042537087574601173, + "C": 0.9875292778015137, + "D": 0.000794697436504066 }, "sample": { "messages": [ @@ -50452,10 +50452,10 @@ ] }, "predict": { - "A": 0.9374455809593201, - "B": 0.03634870797395706, - "C": 0.0033809568267315626, - "D": 0.0009686602861620486 + "A": 0.8864080905914307, + "B": 0.005270774941891432, + "C": 0.0015101025346666574, + "D": 0.00020437012426555157 }, "sample": { "messages": [ @@ -50490,17 +50490,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.020002108067274094, - "B": 0.5845472812652588, - "C": 0.0226653590798378, - "D": 0.35454586148262024 + "A": 0.013349193148314953, + "B": 0.11177141964435577, + "C": 0.01512661762535572, + "D": 0.8258852958679199 }, "sample": { "messages": [ @@ -50530,7 +50530,7 @@ "prompt_len": 91, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { @@ -50542,10 +50542,10 @@ ] }, "predict": { - "A": 0.056442514061927795, - "B": 0.8829095363616943, - "C": 0.00408868258818984, - "D": 0.038792338222265244 + "A": 0.01869157887995243, + "B": 0.7947861552238464, + "C": 0.002097137039527297, + "D": 0.15650266408920288 }, "sample": { "messages": [ @@ -50587,10 +50587,10 @@ ] }, "predict": { - "A": 0.01996568590402603, - "B": 0.008322936482727528, - "C": 0.9620006084442139, - "D": 0.00026754033751785755 + "A": 0.01091026235371828, + "B": 0.0008955688099376857, + "C": 0.9821104407310486, + "D": 3.472497337497771e-05 }, "sample": { "messages": [ @@ -50632,10 +50632,10 @@ ] }, "predict": { - "A": 0.953673779964447, - "B": 0.0016246977029368281, - "C": 0.0006772747146897018, - "D": 0.00021987888612784445 + "A": 0.8798680901527405, + "B": 0.0003141992201562971, + "C": 0.0003344633732922375, + "D": 7.94420120655559e-05 }, "sample": { "messages": [ @@ -50670,17 +50670,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.4529598355293274, - "B": 0.008296248503029346, - "C": 0.0016336282715201378, - "D": 0.5132707357406616 + "A": 0.5428593754768372, + "B": 0.0013456139713525772, + "C": 0.0008161560399457812, + "D": 0.4227793216705322 }, "sample": { "messages": [ @@ -50710,7 +50710,7 @@ "prompt_len": 96, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -50722,10 +50722,10 @@ ] }, "predict": { - "A": 0.01069173589348793, - "B": 0.9624393582344055, - "C": 0.0006834997911937535, - "D": 0.003063233569264412 + "A": 0.026270149275660515, + "B": 0.8699478507041931, + "C": 0.0003106567310169339, + "D": 0.002768853446468711 }, "sample": { "messages": [ @@ -50767,10 +50767,10 @@ ] }, "predict": { - "A": 0.001899070804938674, - "B": 0.9837430119514465, - "C": 0.00015588522364851087, - "D": 0.00027358709485270083 + "A": 0.003060225397348404, + "B": 0.9614942073822021, + "C": 0.0002359792561037466, + "D": 0.0007268672343343496 }, "sample": { "messages": [ @@ -50812,10 +50812,10 @@ ] }, "predict": { - "A": 0.003447163151577115, - "B": 0.9558027982711792, - "C": 0.00036332831950858235, - "D": 0.0007691660430282354 + "A": 0.002240278758108616, + "B": 0.9037929177284241, + "C": 0.0008241524337790906, + "D": 0.001358797657303512 }, "sample": { "messages": [ @@ -50857,10 +50857,10 @@ ] }, "predict": { - "A": 0.00029143079882487655, - "B": 0.9844146966934204, - "C": 5.0643076974665746e-05, - "D": 0.0006169589469209313 + "A": 0.0002147163322661072, + "B": 0.9312824010848999, + "C": 4.5006992877461016e-05, + "D": 0.00045455453800968826 }, "sample": { "messages": [ @@ -50902,10 +50902,10 @@ ] }, "predict": { - "A": 0.003531053429469466, - "B": 0.9790632724761963, - "C": 0.00042172419489361346, - "D": 0.001011663698591292 + "A": 0.007236299104988575, + "B": 0.9477681517601013, + "C": 0.00040824408642947674, + "D": 0.0005241957842372358 }, "sample": { "messages": [ @@ -50947,10 +50947,10 @@ ] }, "predict": { - "A": 0.11891725659370422, - "B": 0.07212695479393005, - "C": 0.7754378914833069, - "D": 0.005224859341979027 + "A": 0.07309255748987198, + "B": 0.01630915328860283, + "C": 0.8904496431350708, + "D": 0.000920099497307092 }, "sample": { "messages": [ @@ -50992,10 +50992,10 @@ ] }, "predict": { - "A": 0.0010188908781856298, - "B": 0.0006179885240271688, - "C": 0.0001378918968839571, - "D": 0.986057460308075 + "A": 0.0003150132833980024, + "B": 9.607362153474241e-05, + "C": 3.5343509807717055e-05, + "D": 0.9390413761138916 }, "sample": { "messages": [ @@ -51037,10 +51037,10 @@ ] }, "predict": { - "A": 0.9564654231071472, - "B": 0.003908855374902487, - "C": 0.0008721835329197347, - "D": 0.00046684624976478517 + "A": 0.903404712677002, + "B": 0.0004409475950524211, + "C": 0.00036555834230966866, + "D": 0.0002082888677250594 }, "sample": { "messages": [ @@ -51082,10 +51082,10 @@ ] }, "predict": { - "A": 0.0007983386749401689, - "B": 5.783145752502605e-05, - "C": 1.877512659120839e-05, - "D": 0.9920541048049927 + "A": 0.0007656338275410235, + "B": 3.160148116876371e-05, + "C": 4.057710611959919e-05, + "D": 0.9514135122299194 }, "sample": { "messages": [ @@ -51127,10 +51127,10 @@ ] }, "predict": { - "A": 0.0007299423450604081, - "B": 0.9655619263648987, - "C": 0.00028584952815435827, - "D": 0.000415908609284088 + "A": 0.0008211226086132228, + "B": 0.9004702568054199, + "C": 0.00032155620283447206, + "D": 0.0005301565979607403 }, "sample": { "messages": [ @@ -51172,10 +51172,10 @@ ] }, "predict": { - "A": 0.9512173533439636, - "B": 0.011974059976637363, - "C": 0.0016205129213631153, - "D": 0.0012620565248653293 + "A": 0.8918479681015015, + "B": 0.0010442466009408236, + "C": 0.0002056243538390845, + "D": 0.00011006278509739786 }, "sample": { "messages": [ @@ -51217,10 +51217,10 @@ ] }, "predict": { - "A": 0.9564900398254395, - "B": 0.010625645518302917, - "C": 0.004429427906870842, - "D": 0.000988338841125369 + "A": 0.9095693826675415, + "B": 0.0015495604602620006, + "C": 0.0032804193906486034, + "D": 0.0003917902067769319 }, "sample": { "messages": [ @@ -51262,10 +51262,10 @@ ] }, "predict": { - "A": 0.007413564715534449, - "B": 0.003501920262351632, - "C": 0.9709854125976562, - "D": 0.001459817518480122 + "A": 0.001006816397421062, + "B": 0.00041970351594500244, + "C": 0.9743720889091492, + "D": 0.00030706165125593543 }, "sample": { "messages": [ @@ -51307,10 +51307,10 @@ ] }, "predict": { - "A": 0.48673921823501587, - "B": 0.016655299812555313, - "C": 0.014698251150548458, - "D": 0.4295458495616913 + "A": 0.6805918216705322, + "B": 0.003355036722496152, + "C": 0.004881549626588821, + "D": 0.22095581889152527 }, "sample": { "messages": [ @@ -51352,10 +51352,10 @@ ] }, "predict": { - "A": 0.005839325953274965, - "B": 0.0013029297115281224, - "C": 5.052006963524036e-05, - "D": 0.9820236563682556 + "A": 0.001862070639617741, + "B": 0.00011182604066561908, + "C": 1.3355715054785833e-05, + "D": 0.9645764827728271 }, "sample": { "messages": [ @@ -51397,10 +51397,10 @@ ] }, "predict": { - "A": 0.9707452654838562, - "B": 0.007411731407046318, - "C": 0.00028738402761518955, - "D": 0.00013575061166193336 + "A": 0.9039384126663208, + "B": 0.00044120807433500886, + "C": 3.196100442437455e-05, + "D": 1.2516108654381242e-05 }, "sample": { "messages": [ @@ -51442,10 +51442,10 @@ ] }, "predict": { - "A": 0.001462999265640974, - "B": 0.0018785282736644149, - "C": 0.00014485654537566006, - "D": 0.9731017351150513 + "A": 0.0006551207625307143, + "B": 0.0004229777550790459, + "C": 6.093571937526576e-05, + "D": 0.9224786758422852 }, "sample": { "messages": [ @@ -51487,10 +51487,10 @@ ] }, "predict": { - "A": 0.22198432683944702, - "B": 0.030042309314012527, - "C": 0.22198432683944702, - "D": 0.46994081139564514 + "A": 0.23389588296413422, + "B": 0.010939455591142178, + "C": 0.18215830624103546, + "D": 0.4951575696468353 }, "sample": { "messages": [ @@ -51532,10 +51532,10 @@ ] }, "predict": { - "A": 0.0025014036800712347, - "B": 0.0828351080417633, - "C": 0.001181579427793622, - "D": 0.8905613422393799 + "A": 0.0007768682553432882, + "B": 0.024167662486433983, + "C": 0.00032384684891439974, + "D": 0.9068848490715027 }, "sample": { "messages": [ @@ -51577,10 +51577,10 @@ ] }, "predict": { - "A": 0.1926691234111786, - "B": 0.017921021208167076, - "C": 0.7620211839675903, - "D": 0.00025562962400726974 + "A": 0.4244001507759094, + "B": 0.006053743418306112, + "C": 0.4809083938598633, + "D": 0.00024986782227642834 }, "sample": { "messages": [ @@ -51622,10 +51622,10 @@ ] }, "predict": { - "A": 0.21220102906227112, - "B": 0.3498603403568268, - "C": 0.3498603403568268, - "D": 0.004990489687770605 + "A": 0.07923999428749084, + "B": 0.6634683012962341, + "C": 0.13064464926719666, + "D": 0.0019837343133985996 }, "sample": { "messages": [ @@ -51655,7 +51655,7 @@ "prompt_len": 89, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " B" } } { @@ -51667,10 +51667,10 @@ ] }, "predict": { - "A": 0.8347536325454712, - "B": 0.05336401239037514, - "C": 0.07764418423175812, - "D": 0.013492535799741745 + "A": 0.9060765504837036, + "B": 0.011405820958316326, + "C": 0.03981022536754608, + "D": 0.006105095613747835 }, "sample": { "messages": [ @@ -51712,10 +51712,10 @@ ] }, "predict": { - "A": 0.39551761746406555, - "B": 0.575474739074707, - "C": 0.002351833740249276, - "D": 0.0018316099885851145 + "A": 0.12468221783638, + "B": 0.8130301833152771, + "C": 0.0008942840504460037, + "D": 0.0010787125211209059 }, "sample": { "messages": [ @@ -51757,10 +51757,10 @@ ] }, "predict": { - "A": 0.01760217361152172, - "B": 0.007337676826864481, - "C": 0.9610460996627808, - "D": 0.005043107084929943 + "A": 0.015732724219560623, + "B": 0.0018790060421451926, + "C": 0.9733492732048035, + "D": 0.0011396748013794422 }, "sample": { "messages": [ @@ -51802,10 +51802,10 @@ ] }, "predict": { - "A": 0.43013763427734375, - "B": 0.007878245785832405, - "C": 0.29562899470329285, - "D": 0.17930804193019867 + "A": 0.39440101385116577, + "B": 0.004115943796932697, + "C": 0.39440101385116577, + "D": 0.1280432641506195 }, "sample": { "messages": [ @@ -51835,7 +51835,7 @@ "prompt_len": 74, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -51847,10 +51847,10 @@ ] }, "predict": { - "A": 0.005779674742370844, - "B": 0.9719918966293335, - "C": 0.0002539411943871528, - "D": 0.0012896198313683271 + "A": 0.002573765115812421, + "B": 0.9163237810134888, + "C": 0.0001751468953443691, + "D": 0.0006927194190211594 }, "sample": { "messages": [ @@ -51892,10 +51892,10 @@ ] }, "predict": { - "A": 0.001279116258956492, - "B": 0.9640752673149109, - "C": 0.00030381674878299236, - "D": 0.00016262137796729803 + "A": 0.000928201770875603, + "B": 0.8982908129692078, + "C": 0.000220467263716273, + "D": 0.0002071098133455962 }, "sample": { "messages": [ @@ -51937,10 +51937,10 @@ ] }, "predict": { - "A": 0.0043522813357412815, - "B": 0.9398309588432312, - "C": 0.001601114752702415, - "D": 0.009213779121637344 + "A": 0.0017421654192730784, + "B": 0.9024640321731567, + "C": 0.00019546553085092455, + "D": 0.0010566767305135727 }, "sample": { "messages": [ @@ -51982,10 +51982,10 @@ ] }, "predict": { - "A": 0.004028587136417627, - "B": 0.0027688047848641872, - "C": 0.9857627153396606, - "D": 0.00025753892259672284 + "A": 0.0014914474450051785, + "B": 0.0006217277841642499, + "C": 0.9920238256454468, + "D": 0.00010804009798448533 }, "sample": { "messages": [ @@ -52027,10 +52027,10 @@ ] }, "predict": { - "A": 0.9113302230834961, - "B": 0.03533610701560974, - "C": 0.0032867700792849064, - "D": 0.0004735041002277285 + "A": 0.8623157143592834, + "B": 0.002744561992585659, + "C": 0.0005404362455010414, + "D": 6.0635273257503286e-05 }, "sample": { "messages": [ @@ -52072,10 +52072,10 @@ ] }, "predict": { - "A": 0.003122724127024412, - "B": 0.2810986340045929, - "C": 0.02614624612033367, - "D": 0.6743205785751343 + "A": 0.0005627117934636772, + "B": 0.21326006948947906, + "C": 0.005683187860995531, + "D": 0.7443507313728333 }, "sample": { "messages": [ @@ -52117,10 +52117,10 @@ ] }, "predict": { - "A": 0.3158567547798157, - "B": 0.6686687469482422, - "C": 0.00022431337856687605, - "D": 0.0001979558728635311 + "A": 0.3000594973564148, + "B": 0.6352258920669556, + "C": 0.00021309455041773617, + "D": 0.0001766615896485746 }, "sample": { "messages": [ @@ -52162,10 +52162,10 @@ ] }, "predict": { - "A": 0.00033003618591465056, - "B": 0.9838240146636963, - "C": 9.45569554460235e-05, - "D": 0.00020017707720398903 + "A": 0.00039943173760548234, + "B": 0.9273097515106201, + "C": 5.40572073077783e-05, + "D": 0.00011443911353126168 }, "sample": { "messages": [ @@ -52207,10 +52207,10 @@ ] }, "predict": { - "A": 0.0513821542263031, - "B": 0.02427120879292488, - "C": 0.9107704162597656, - "D": 0.0011351796565577388 + "A": 0.013245037756860256, + "B": 0.03600374609231949, + "C": 0.9285488724708557, + "D": 0.00031149343703873456 }, "sample": { "messages": [ @@ -52252,10 +52252,10 @@ ] }, "predict": { - "A": 0.0007970550213940442, - "B": 0.9904589653015137, - "C": 6.542625487782061e-05, - "D": 0.00010786967322928831 + "A": 0.0006191631546244025, + "B": 0.928075909614563, + "C": 6.13054507994093e-05, + "D": 7.871775596868247e-05 }, "sample": { "messages": [ @@ -52297,10 +52297,10 @@ ] }, "predict": { - "A": 0.0019096162868663669, - "B": 0.0013971051666885614, - "C": 0.9892057180404663, - "D": 0.0003760257677640766 + "A": 0.0011627988424152136, + "B": 0.00020206414046697319, + "C": 0.9930985569953918, + "D": 8.423285908065736e-05 }, "sample": { "messages": [ @@ -52342,10 +52342,10 @@ ] }, "predict": { - "A": 0.8873487114906311, - "B": 0.026795610785484314, - "C": 0.056726302951574326, - "D": 0.0067749908193945885 + "A": 0.9465566873550415, + "B": 0.003012682544067502, + "C": 0.007227049209177494, + "D": 0.0009188161930069327 }, "sample": { "messages": [ @@ -52387,10 +52387,10 @@ ] }, "predict": { - "A": 0.008071809075772762, - "B": 0.0023126122541725636, - "C": 0.04099202901124954, - "D": 0.9329742789268494 + "A": 0.0018126697978004813, + "B": 0.0004305468173697591, + "C": 0.17369499802589417, + "D": 0.7784469127655029 }, "sample": { "messages": [ @@ -52432,10 +52432,10 @@ ] }, "predict": { - "A": 0.15823887288570404, - "B": 0.17930813133716583, - "C": 0.6258468627929688, - "D": 0.010115872137248516 + "A": 0.019394027069211006, + "B": 0.028218142688274384, + "C": 0.9344565272331238, + "D": 0.0015919588040560484 }, "sample": { "messages": [ @@ -52477,10 +52477,10 @@ ] }, "predict": { - "A": 0.8665643334388733, - "B": 0.09133520722389221, - "C": 0.00022639732924290001, - "D": 5.0516075134510174e-05 + "A": 0.7365059852600098, + "B": 0.164336696267128, + "C": 0.0004073499294463545, + "D": 8.02119611762464e-05 }, "sample": { "messages": [ @@ -52522,10 +52522,10 @@ ] }, "predict": { - "A": 0.0006139924516901374, - "B": 0.9796813726425171, - "C": 4.1782710468396544e-05, - "D": 0.00012090228119632229 + "A": 0.0005528152687475085, + "B": 0.8820675611495972, + "C": 3.1187704735202715e-05, + "D": 7.964059477671981e-05 }, "sample": { "messages": [ @@ -52567,10 +52567,10 @@ ] }, "predict": { - "A": 0.003556503914296627, - "B": 0.001018955372273922, - "C": 0.9861199855804443, - "D": 0.000481320486869663 + "A": 0.0013889297842979431, + "B": 0.00012919059372507036, + "C": 0.9834172129631042, + "D": 0.00022673653438687325 }, "sample": { "messages": [ @@ -52612,10 +52612,10 @@ ] }, "predict": { - "A": 0.8829747438430786, - "B": 0.02353047952055931, - "C": 0.011115011759102345, - "D": 0.05644668638706207 + "A": 0.7699088454246521, + "B": 0.02051737532019615, + "C": 0.09195250272750854, + "D": 0.009691721759736538 }, "sample": { "messages": [ @@ -52657,10 +52657,10 @@ ] }, "predict": { - "A": 0.9404959678649902, - "B": 0.005592393223196268, - "C": 0.0011012075701728463, - "D": 0.0011012075701728463 + "A": 0.8662278652191162, + "B": 0.00030932831577956676, + "C": 0.000155540052219294, + "D": 0.00010042421490652487 }, "sample": { "messages": [ @@ -52702,10 +52702,10 @@ ] }, "predict": { - "A": 0.9407144784927368, - "B": 0.004936416633427143, - "C": 0.0023317981977015734, - "D": 0.008138774894177914 + "A": 0.8466419577598572, + "B": 0.0007252620998769999, + "C": 0.000931254995521158, + "D": 0.0034600321669131517 }, "sample": { "messages": [ @@ -52740,17 +52740,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.2617918848991394, - "B": 0.010150771588087082, - "C": 0.0019988056737929583, - "D": 0.711624026298523 + "A": 0.5457472801208496, + "B": 0.0018490203656256199, + "C": 0.0007707863696850836, + "D": 0.42502838373184204 }, "sample": { "messages": [ @@ -52780,7 +52780,7 @@ "prompt_len": 74, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -52792,10 +52792,10 @@ ] }, "predict": { - "A": 0.0014844569377601147, - "B": 0.004035172518342733, - "C": 0.9873741269111633, - "D": 0.0007012077840045094 + "A": 0.0006591234705410898, + "B": 0.0006191891734488308, + "C": 0.9879732131958008, + "D": 0.0002277869643876329 }, "sample": { "messages": [ @@ -52837,10 +52837,10 @@ ] }, "predict": { - "A": 0.0461987741291523, - "B": 0.0037922265473753214, - "C": 0.0004529168945737183, - "D": 0.9279272556304932 + "A": 0.23550011217594147, + "B": 0.000962435151450336, + "C": 0.0003540601464919746, + "D": 0.7253914475440979 }, "sample": { "messages": [ @@ -52882,10 +52882,10 @@ ] }, "predict": { - "A": 0.12810645997524261, - "B": 0.04159007966518402, - "C": 0.239334374666214, - "D": 0.5741333365440369 + "A": 0.023560721427202225, + "B": 0.00219148863106966, + "C": 0.1536351889371872, + "D": 0.7802239060401917 }, "sample": { "messages": [ @@ -52927,10 +52927,10 @@ ] }, "predict": { - "A": 0.06045512482523918, - "B": 0.23910465836524963, - "C": 0.09967364370822906, - "D": 0.5735822916030884 + "A": 0.031914178282022476, + "B": 0.014161830767989159, + "C": 0.18365341424942017, + "D": 0.7263633012771606 }, "sample": { "messages": [ @@ -52972,10 +52972,10 @@ ] }, "predict": { - "A": 0.35756805539131165, - "B": 0.168903186917305, - "C": 0.4051777124404907, - "D": 0.013864418491721153 + "A": 0.17404486238956451, + "B": 0.32515859603881836, + "C": 0.3684529662132263, + "D": 0.004093143157660961 }, "sample": { "messages": [ @@ -53017,10 +53017,10 @@ ] }, "predict": { - "A": 0.012254063971340656, - "B": 0.004508018493652344, - "C": 0.973460853099823, - "D": 0.00019806849013548344 + "A": 0.019889378920197487, + "B": 0.0018500004662200809, + "C": 0.9583239555358887, + "D": 9.804642468225211e-05 }, "sample": { "messages": [ @@ -53055,17 +53055,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "C" + "B" ] }, "predict": { - "A": 0.010275986976921558, - "B": 0.38560357689857483, - "C": 0.5610498785972595, - "D": 0.019198071211576462 + "A": 0.015109523199498653, + "B": 0.6424732208251953, + "C": 0.2678226828575134, + "D": 0.005916973575949669 }, "sample": { "messages": [ @@ -53095,7 +53095,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " B" } } { @@ -53107,10 +53107,10 @@ ] }, "predict": { - "A": 0.03124420717358589, - "B": 0.024333013221621513, - "C": 0.004791454412043095, - "D": 0.9130895733833313 + "A": 0.09855388104915619, + "B": 0.031995758414268494, + "C": 0.004906708374619484, + "D": 0.8251815438270569 }, "sample": { "messages": [ @@ -53152,10 +53152,10 @@ ] }, "predict": { - "A": 0.006486848928034306, - "B": 0.01998089998960495, - "C": 0.9627337455749512, - "D": 0.001447411603294313 + "A": 0.002136439085006714, + "B": 0.0065806955099105835, + "C": 0.9766618013381958, + "D": 0.0010742689482867718 }, "sample": { "messages": [ @@ -53197,10 +53197,10 @@ ] }, "predict": { - "A": 0.959112823009491, - "B": 0.0014419677900150418, - "C": 0.00010445580119267106, - "D": 0.00011836392513941973 + "A": 0.9234208464622498, + "B": 0.00016580962983425707, + "C": 2.706767918425612e-05, + "D": 3.2649855711497366e-05 }, "sample": { "messages": [ @@ -53242,10 +53242,10 @@ ] }, "predict": { - "A": 0.0001771921233739704, - "B": 0.00012178225006209686, - "C": 1.867591345217079e-05, - "D": 0.9868118166923523 + "A": 3.7914836866548285e-05, + "B": 3.7914836866548285e-05, + "C": 1.4847665624984074e-05, + "D": 0.9463261961936951 }, "sample": { "messages": [ @@ -53287,10 +53287,10 @@ ] }, "predict": { - "A": 0.03806100785732269, - "B": 0.012356600724160671, - "C": 0.31868094205856323, - "D": 0.595374345779419 + "A": 0.009588449262082577, + "B": 0.0016662225825712085, + "C": 0.1499885618686676, + "D": 0.7617048025131226 }, "sample": { "messages": [ @@ -53332,10 +53332,10 @@ ] }, "predict": { - "A": 0.000707104685716331, - "B": 0.00013923717779107392, - "C": 0.9956775307655334, - "D": 0.0001228763721883297 + "A": 0.00017753026622813195, + "B": 2.402610880380962e-05, + "C": 0.988694965839386, + "D": 2.1202966308919713e-05 }, "sample": { "messages": [ @@ -53377,10 +53377,10 @@ ] }, "predict": { - "A": 0.013832714408636093, - "B": 0.0012866429751738906, - "C": 0.001872054417617619, - "D": 0.9697481989860535 + "A": 0.0034375351388007402, + "B": 0.00018218267359770834, + "C": 0.00036231352714821696, + "D": 0.9531331658363342 }, "sample": { "messages": [ @@ -53422,10 +53422,10 @@ ] }, "predict": { - "A": 0.17251385748386383, - "B": 0.018182827159762383, - "C": 0.7731534242630005, - "D": 0.0010258047841489315 + "A": 0.3583442270755768, + "B": 0.006563303526490927, + "C": 0.5213877558708191, + "D": 0.0007838748279027641 }, "sample": { "messages": [ @@ -53467,10 +53467,10 @@ ] }, "predict": { - "A": 0.0011421984527260065, - "B": 0.0016618891386315227, - "C": 0.0005743334768339992, - "D": 0.9755046367645264 + "A": 0.0006192217115312815, + "B": 0.00022779895516578108, + "C": 0.002030349802225828, + "D": 0.9281637072563171 }, "sample": { "messages": [ @@ -53512,10 +53512,10 @@ ] }, "predict": { - "A": 0.959340512752533, - "B": 0.0018519629957154393, - "C": 0.0006812990759499371, - "D": 6.337068043649197e-05 + "A": 0.8793683648109436, + "B": 0.0007076567853800952, + "C": 0.0005511235794983804, + "D": 7.006750092841685e-05 }, "sample": { "messages": [ @@ -53557,10 +53557,10 @@ ] }, "predict": { - "A": 0.0007762601599097252, - "B": 0.9646182656288147, - "C": 0.0007762601599097252, - "D": 0.0006850471836514771 + "A": 0.0006840047426521778, + "B": 0.8499772548675537, + "C": 0.0013603059342131019, + "D": 0.0014480381505563855 }, "sample": { "messages": [ @@ -53602,10 +53602,10 @@ ] }, "predict": { - "A": 0.001021422678604722, - "B": 0.0016840414609760046, - "C": 0.9885077476501465, - "D": 0.001486161258071661 + "A": 0.00033269048435613513, + "B": 0.00013028347166255116, + "C": 0.9917363524436951, + "D": 8.411732414970174e-05 }, "sample": { "messages": [ @@ -53647,10 +53647,10 @@ ] }, "predict": { - "A": 0.0007054521702229977, - "B": 0.0011630940716713667, - "C": 0.9933506846427917, - "D": 8.425425039604306e-05 + "A": 0.00035435453173704445, + "B": 0.00029377025202848017, + "C": 0.9923170804977417, + "D": 2.2653128326055594e-05 }, "sample": { "messages": [ @@ -53692,10 +53692,10 @@ ] }, "predict": { - "A": 0.8903046250343323, - "B": 0.02688487060368061, - "C": 0.02372581511735916, - "D": 0.02372581511735916 + "A": 0.9313017129898071, + "B": 0.0031552992295473814, + "C": 0.011723358184099197, + "D": 0.005537722259759903 }, "sample": { "messages": [ @@ -53737,10 +53737,10 @@ ] }, "predict": { - "A": 0.0006959968595765531, - "B": 0.9800365567207336, - "C": 5.0417842430761084e-05, - "D": 0.00019940643687732518 + "A": 0.0012197556206956506, + "B": 0.9193349480628967, + "C": 5.7048728194786236e-05, + "D": 0.00025567467673681676 }, "sample": { "messages": [ @@ -53782,10 +53782,10 @@ ] }, "predict": { - "A": 0.42043712735176086, - "B": 0.2550080120563507, - "C": 0.014386567287147045, - "D": 0.2889619469642639 + "A": 0.6813907027244568, + "B": 0.10449465364217758, + "C": 0.016024773940443993, + "D": 0.17228256165981293 }, "sample": { "messages": [ @@ -53827,10 +53827,10 @@ ] }, "predict": { - "A": 0.7997881770133972, - "B": 0.1082395613193512, - "C": 0.0017495296197012067, - "D": 0.024151511490345 + "A": 0.8466963171958923, + "B": 0.047767337411642075, + "C": 0.0009313147747889161, + "D": 0.006072934716939926 }, "sample": { "messages": [ @@ -53872,10 +53872,10 @@ ] }, "predict": { - "A": 0.23334845900535583, - "B": 0.013164618983864784, - "C": 0.7187638282775879, - "D": 0.01915440708398819 + "A": 0.33183133602142334, + "B": 0.003056060755625367, + "C": 0.6199425458908081, + "D": 0.006886939983814955 }, "sample": { "messages": [ @@ -53917,10 +53917,10 @@ ] }, "predict": { - "A": 0.0016572814201936126, - "B": 0.9728000164031982, - "C": 0.00025415231357328594, - "D": 0.01080683246254921 + "A": 0.0016220908146351576, + "B": 0.9521436095237732, + "C": 8.59676583786495e-05, + "D": 0.00343396607786417 }, "sample": { "messages": [ @@ -53962,10 +53962,10 @@ ] }, "predict": { - "A": 0.006365088280290365, - "B": 0.944662868976593, - "C": 0.017302103340625763, - "D": 0.003006654791533947 + "A": 0.002628618385642767, + "B": 0.9358528852462769, + "C": 0.0012416712706908584, + "D": 0.0008533873478882015 }, "sample": { "messages": [ @@ -54007,10 +54007,10 @@ ] }, "predict": { - "A": 0.031887736171483994, - "B": 0.009135990403592587, - "C": 0.9318962693214417, - "D": 0.013292786665260792 + "A": 0.028514424338936806, + "B": 0.0021987962536513805, + "C": 0.9442680478096008, + "D": 0.00925727840512991 }, "sample": { "messages": [ @@ -54052,10 +54052,10 @@ ] }, "predict": { - "A": 0.7683908939361572, - "B": 0.19427964091300964, - "C": 0.0035583560820668936, - "D": 0.0019046507077291608 + "A": 0.9065629243850708, + "B": 0.006921692751348019, + "C": 0.0009971644030883908, + "D": 0.00025212267064489424 }, "sample": { "messages": [ @@ -54097,10 +54097,10 @@ ] }, "predict": { - "A": 0.004949420690536499, - "B": 0.9431926012039185, - "C": 0.0007590182358399034, - "D": 0.0008600803557783365 + "A": 0.010978056117892265, + "B": 0.8720950484275818, + "C": 0.0007470661075785756, + "D": 0.00203073606826365 }, "sample": { "messages": [ @@ -54135,17 +54135,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "D" ] }, "predict": { - "A": 0.5349054932594299, - "B": 0.09295262396335602, - "C": 0.09295262396335602, - "D": 0.2229817658662796 + "A": 0.27424606680870056, + "B": 0.01753196306526661, + "C": 0.007308410480618477, + "D": 0.6578820943832397 }, "sample": { "messages": [ @@ -54175,7 +54175,7 @@ "prompt_len": 92, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -54187,10 +54187,10 @@ ] }, "predict": { - "A": 0.012029822915792465, - "B": 0.007296456024050713, - "C": 0.00643910001963377, - "D": 0.9556471705436707 + "A": 0.0008799797506071627, + "B": 0.00023684336338192225, + "C": 0.00044248162885196507, + "D": 0.9650149941444397 }, "sample": { "messages": [ @@ -54232,10 +54232,10 @@ ] }, "predict": { - "A": 0.019543560221791267, - "B": 0.028435714542865753, - "C": 0.941661536693573, - "D": 0.0011025721905753016 + "A": 0.015466379933059216, + "B": 0.012045228853821754, + "C": 0.9568710327148438, + "D": 0.00041216507088392973 }, "sample": { "messages": [ @@ -54277,10 +54277,10 @@ ] }, "predict": { - "A": 0.006521658040583134, - "B": 0.003080613212659955, - "C": 0.003955585416406393, - "D": 0.9678998589515686 + "A": 0.0019374850671738386, + "B": 0.000381513440515846, + "C": 0.0005908995517529547, + "D": 0.9428344964981079 }, "sample": { "messages": [ @@ -54322,10 +54322,10 @@ ] }, "predict": { - "A": 0.15623416006565094, - "B": 0.023959286510944366, - "C": 0.009987716563045979, - "D": 0.7934225797653198 + "A": 0.0523468442261219, + "B": 0.0026061958633363247, + "C": 0.00048209773376584053, + "D": 0.927869975566864 }, "sample": { "messages": [ @@ -54367,10 +54367,10 @@ ] }, "predict": { - "A": 0.01388244703412056, - "B": 0.005107067059725523, - "C": 0.9732347130775452, - "D": 0.0003938147274311632 + "A": 0.005177606828510761, + "B": 0.00027440310805104673, + "C": 0.9866771697998047, + "D": 4.2081075662281364e-05 }, "sample": { "messages": [ @@ -54412,10 +54412,10 @@ ] }, "predict": { - "A": 0.44570043683052063, - "B": 0.44570043683052063, - "C": 0.025144698098301888, - "D": 0.041456595063209534 + "A": 0.8250588774681091, + "B": 0.0869605615735054, + "C": 0.010385958477854729, + "D": 0.008610262535512447 }, "sample": { "messages": [ @@ -54457,10 +54457,10 @@ ] }, "predict": { - "A": 0.06665949523448944, - "B": 0.9202058911323547, - "C": 0.0009508465882390738, - "D": 0.0008391191950067878 + "A": 0.011859065853059292, + "B": 0.9420822858810425, + "C": 0.001103064976632595, + "D": 0.0012499364092946053 }, "sample": { "messages": [ @@ -54502,10 +54502,10 @@ ] }, "predict": { - "A": 0.48729532957077026, - "B": 0.03999963775277138, - "C": 0.43003663420677185, - "D": 0.0037205456756055355 + "A": 0.7469393014907837, + "B": 0.002693883841857314, + "C": 0.10108724236488342, + "D": 0.0004983184044249356 }, "sample": { "messages": [ @@ -54547,10 +54547,10 @@ ] }, "predict": { - "A": 0.7681205868721008, - "B": 0.09173892438411713, - "C": 0.02978326752781868, - "D": 0.07144634425640106 + "A": 0.9140210151672363, + "B": 0.024357834830880165, + "C": 0.0069786361418664455, + "D": 0.021495714783668518 }, "sample": { "messages": [ @@ -54585,17 +54585,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.06513772904872894, - "B": 0.8991985321044922, - "C": 0.004718561191111803, - "D": 0.008815432898700237 + "A": 0.7247716784477234, + "B": 0.2076505720615387, + "C": 0.004048541653901339, + "D": 0.009123529307544231 }, "sample": { "messages": [ @@ -54625,7 +54625,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -54637,10 +54637,10 @@ ] }, "predict": { - "A": 0.04055103287100792, - "B": 0.014917891472578049, - "C": 0.922937273979187, - "D": 0.011618065647780895 + "A": 0.022405849769711494, + "B": 0.0026759973261505365, + "C": 0.9527209401130676, + "D": 0.011992987245321274 }, "sample": { "messages": [ @@ -54675,17 +54675,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.24643956124782562, - "B": 0.5217125415802002, - "C": 0.004513697698712349, - "D": 0.19192732870578766 + "A": 0.6106535196304321, + "B": 0.2246468961238861, + "C": 0.001254867180250585, + "D": 0.08264296501874924 }, "sample": { "messages": [ @@ -54715,7 +54715,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -54727,10 +54727,10 @@ ] }, "predict": { - "A": 0.008332725614309311, - "B": 0.9631320834159851, - "C": 0.0003230948350392282, - "D": 0.015567580237984657 + "A": 0.010335998609662056, + "B": 0.9304168820381165, + "C": 0.0005145990289747715, + "D": 0.011712220497429371 }, "sample": { "messages": [ @@ -54772,10 +54772,10 @@ ] }, "predict": { - "A": 0.0003697374486364424, - "B": 6.42507293378003e-05, - "C": 0.0004747523053083569, - "D": 0.972663164138794 + "A": 8.789273852016777e-05, + "B": 7.214674951683264e-06, + "C": 0.0001980693341465667, + "D": 0.91448575258255 }, "sample": { "messages": [ @@ -54817,10 +54817,10 @@ ] }, "predict": { - "A": 0.00042672562994994223, - "B": 0.0007972284220159054, - "C": 0.9906744360923767, - "D": 0.0003323342534713447 + "A": 0.00014700624160468578, + "B": 0.00029235685360617936, + "C": 0.9875427484512329, + "D": 9.491436503594741e-05 }, "sample": { "messages": [ @@ -54862,10 +54862,10 @@ ] }, "predict": { - "A": 0.68586266040802, - "B": 0.012562013231217861, - "C": 0.25231480598449707, - "D": 0.003599076997488737 + "A": 0.5192870497703552, + "B": 0.013838531449437141, + "C": 0.2779543399810791, + "D": 0.0024047764018177986 }, "sample": { "messages": [ @@ -54907,10 +54907,10 @@ ] }, "predict": { - "A": 0.05244103819131851, - "B": 0.9295396208763123, - "C": 0.0010883789509534836, - "D": 0.0020333596039563417 + "A": 0.05062078312039375, + "B": 0.8972748517990112, + "C": 0.0006783188437111676, + "D": 0.002855829196050763 }, "sample": { "messages": [ @@ -54952,10 +54952,10 @@ ] }, "predict": { - "A": 0.30265742540359497, - "B": 0.6407257914543152, - "C": 0.010356367565691471, - "D": 0.008065546862781048 + "A": 0.2806990444660187, + "B": 0.5942398905754089, + "C": 0.010883883573114872, + "D": 0.004537077154964209 }, "sample": { "messages": [ @@ -54997,10 +54997,10 @@ ] }, "predict": { - "A": 0.0005389309953898191, - "B": 0.9744101166725159, - "C": 0.0003704015107359737, - "D": 0.007439712528139353 + "A": 0.0006401872378773987, + "B": 0.9014507532119751, + "C": 0.0004133359470870346, + "D": 0.005360221955925226 }, "sample": { "messages": [ @@ -55042,10 +55042,10 @@ ] }, "predict": { - "A": 0.0018547987565398216, - "B": 0.02560470625758171, - "C": 0.9608095288276672, - "D": 0.0014445186825469136 + "A": 0.0005105177988298237, + "B": 0.007502003572881222, + "C": 0.982568621635437, + "D": 0.00047958712093532085 }, "sample": { "messages": [ @@ -55087,10 +55087,10 @@ ] }, "predict": { - "A": 0.013849534094333649, - "B": 0.9709272980690002, - "C": 0.00047390503459610045, - "D": 0.0004182196862529963 + "A": 0.021643130108714104, + "B": 0.9202892780303955, + "C": 0.0003723906120285392, + "D": 0.0003964077332057059 }, "sample": { "messages": [ @@ -55132,10 +55132,10 @@ ] }, "predict": { - "A": 0.0019150003790855408, - "B": 0.0007044892408885062, - "C": 0.9919947385787964, - "D": 0.001161506399512291 + "A": 0.0004277286643628031, + "B": 0.0001783038314897567, + "C": 0.9930030703544617, + "D": 0.0002150754735339433 }, "sample": { "messages": [ @@ -55177,10 +55177,10 @@ ] }, "predict": { - "A": 0.8953479528427124, - "B": 0.044576749205589294, - "C": 0.006032806821167469, - "D": 0.005323933437466621 + "A": 0.8187089562416077, + "B": 0.02801467850804329, + "C": 0.0027738288044929504, + "D": 0.0033458764664828777 }, "sample": { "messages": [ @@ -55222,10 +55222,10 @@ ] }, "predict": { - "A": 0.8969885110855103, - "B": 0.05734255537390709, - "C": 0.012794854119420052, - "D": 0.00996464304625988 + "A": 0.6623005867004395, + "B": 0.06980596482753754, + "C": 0.08963263779878616, + "D": 0.009447210468351841 }, "sample": { "messages": [ @@ -55260,17 +55260,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.4260156452655792, - "B": 0.0011240924941375852, - "C": 0.0017410286236554384, - "D": 0.5470148921012878 + "A": 0.7994922995567322, + "B": 0.00017316278535872698, + "C": 0.0022456094156950712, + "D": 0.17839084565639496 }, "sample": { "messages": [ @@ -55300,22 +55300,22 @@ "prompt_len": 61, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.010355344973504543, - "B": 0.7259657382965088, - "C": 0.2356865555047989, - "D": 0.000904840009752661 + "A": 0.0047297729179263115, + "B": 0.07398609071969986, + "C": 0.9013351202011108, + "D": 0.0003218650526832789 }, "sample": { "messages": [ @@ -55345,7 +55345,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -55357,10 +55357,10 @@ ] }, "predict": { - "A": 0.08238869905471802, - "B": 0.011150098405778408, - "C": 0.002487923251464963, - "D": 0.8857619762420654 + "A": 0.10014727711677551, + "B": 0.008220589719712734, + "C": 0.006402200553566217, + "D": 0.8385228514671326 }, "sample": { "messages": [ @@ -55395,17 +55395,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "D" + "B" ] }, "predict": { - "A": 0.1483314335346222, - "B": 0.19046133756637573, - "C": 0.06183374300599098, - "D": 0.586662232875824 + "A": 0.14284548163414001, + "B": 0.5649648904800415, + "C": 0.04637514054775238, + "D": 0.2078389823436737 }, "sample": { "messages": [ @@ -55435,7 +55435,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " B" } } { @@ -55447,10 +55447,10 @@ ] }, "predict": { - "A": 0.003941038157790899, - "B": 0.002708633430302143, - "C": 0.006497673690319061, - "D": 0.9643402695655823 + "A": 0.003177517093718052, + "B": 0.00045776477782055736, + "C": 0.004343151114881039, + "D": 0.9378594756126404 }, "sample": { "messages": [ @@ -55492,10 +55492,10 @@ ] }, "predict": { - "A": 0.012166386470198631, - "B": 0.9664957523345947, - "C": 0.007379285525530577, - "D": 0.0007777710561640561 + "A": 0.004901121836155653, + "B": 0.9339884519577026, + "C": 0.002043091459199786, + "D": 0.0008000861271284521 }, "sample": { "messages": [ @@ -55537,10 +55537,10 @@ ] }, "predict": { - "A": 0.019964195787906647, - "B": 0.6611233353614807, - "C": 0.2432137131690979, - "D": 0.03729802742600441 + "A": 0.02224060334265232, + "B": 0.5735927224159241, + "C": 0.3070220947265625, + "D": 0.03235989063978195 }, "sample": { "messages": [ @@ -55582,10 +55582,10 @@ ] }, "predict": { - "A": 0.0016640768153592944, - "B": 0.002743598772212863, - "C": 0.00042074447264894843, - "D": 0.9767887592315674 + "A": 0.0020749010145664215, + "B": 0.0005584520404227078, + "C": 0.00021869294869247824, + "D": 0.9485300183296204 }, "sample": { "messages": [ @@ -55627,10 +55627,10 @@ ] }, "predict": { - "A": 0.032012976706027985, - "B": 0.0986069068312645, - "C": 0.8256255388259888, - "D": 0.01512185949832201 + "A": 0.0020960872061550617, + "B": 0.012062148191034794, + "C": 0.9582151174545288, + "D": 0.002691429341211915 }, "sample": { "messages": [ @@ -55672,10 +55672,10 @@ ] }, "predict": { - "A": 0.9211716055870056, - "B": 0.03571769967675209, - "C": 0.003764617955312133, - "D": 0.010233293287456036 + "A": 0.8157585263252258, + "B": 0.027913721278309822, + "C": 0.00428070779889822, + "D": 0.021739227697253227 }, "sample": { "messages": [ @@ -55717,10 +55717,10 @@ ] }, "predict": { - "A": 0.001161907333880663, - "B": 0.0005488461465574801, - "C": 0.992337167263031, - "D": 5.784795939689502e-05 + "A": 0.00025908753741532564, + "B": 5.43077003385406e-05, + "C": 0.9916901588439941, + "D": 1.3731137187278364e-05 }, "sample": { "messages": [ @@ -55755,17 +55755,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.05819302052259445, - "B": 0.5521200299263, - "C": 0.2031136304140091, - "D": 0.15818503499031067 + "A": 0.09111736714839935, + "B": 0.1502271443605423, + "C": 0.24768270552158356, + "D": 0.462732195854187 }, "sample": { "messages": [ @@ -55795,22 +55795,22 @@ "prompt_len": 81, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.22797639667987823, - "B": 0.4259159564971924, - "C": 0.1566857397556305, - "D": 0.1566857397556305 + "A": 0.38983088731765747, + "B": 0.143410786986351, + "C": 0.031999267637729645, + "D": 0.38983088731765747 }, "sample": { "messages": [ @@ -55840,7 +55840,7 @@ "prompt_len": 97, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -55852,10 +55852,10 @@ ] }, "predict": { - "A": 0.0014718599850311875, - "B": 0.9789953827857971, - "C": 0.000541467044968158, - "D": 0.0012989118695259094 + "A": 0.0013749186182394624, + "B": 0.9145156741142273, + "C": 0.0010707876645028591, + "D": 0.0010707876645028591 }, "sample": { "messages": [ @@ -55897,10 +55897,10 @@ ] }, "predict": { - "A": 0.005357095506042242, - "B": 0.17740264534950256, - "C": 0.7950634360313416, - "D": 0.0010548746213316917 + "A": 0.0007940604700706899, + "B": 0.004569502547383308, + "C": 0.9867377877235413, + "D": 0.00024217477766796947 }, "sample": { "messages": [ @@ -55942,10 +55942,10 @@ ] }, "predict": { - "A": 0.03491715341806412, - "B": 0.7947106957435608, - "C": 0.001534152659587562, - "D": 0.15648780763149261 + "A": 0.03526749834418297, + "B": 0.8026845455169678, + "C": 0.0006068107904866338, + "D": 0.13948564231395721 }, "sample": { "messages": [ @@ -55987,10 +55987,10 @@ ] }, "predict": { - "A": 0.7356350421905518, - "B": 0.18599766492843628, - "C": 0.041501689702272415, - "D": 0.009260278195142746 + "A": 0.7729402184486389, + "B": 0.014156893827021122, + "C": 0.15220093727111816, + "D": 0.0021710300352424383 }, "sample": { "messages": [ @@ -56032,10 +56032,10 @@ ] }, "predict": { - "A": 0.1917898803949356, - "B": 0.7585436701774597, - "C": 0.003512754337862134, - "D": 0.029411930590867996 + "A": 0.25570812821388245, + "B": 0.6950867176055908, + "C": 0.0017229478107765317, + "D": 0.00874985009431839 }, "sample": { "messages": [ @@ -56077,10 +56077,10 @@ ] }, "predict": { - "A": 5.807502748211846e-05, - "B": 3.5224285966251045e-05, - "C": 0.9962323307991028, - "D": 0.00017888368165586144 + "A": 3.288931111455895e-05, + "B": 7.3385981522733346e-06, + "C": 0.9901857376098633, + "D": 6.144532380858436e-05 }, "sample": { "messages": [ @@ -56122,10 +56122,10 @@ ] }, "predict": { - "A": 0.9499421715736389, - "B": 0.02868576906621456, - "C": 0.0011122679570689797, - "D": 0.003882196731865406 + "A": 0.92906653881073, + "B": 0.002609557006508112, + "C": 0.0004534730105660856, + "D": 0.0003116670995950699 }, "sample": { "messages": [ @@ -56160,17 +56160,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.49588078260421753, - "B": 0.09764470905065536, - "C": 0.23423750698566437, - "D": 0.1253782957792282 + "A": 0.04188782349228859, + "B": 0.007279012817889452, + "C": 0.06094643101096153, + "D": 0.8413394689559937 }, "sample": { "messages": [ @@ -56200,7 +56200,7 @@ "prompt_len": 74, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -56212,10 +56212,10 @@ ] }, "predict": { - "A": 0.383367121219635, - "B": 0.2052016407251358, - "C": 0.1244610846042633, - "D": 0.18108981847763062 + "A": 0.35424381494522095, + "B": 0.06975473463535309, + "C": 0.16733293235301971, + "D": 0.27588534355163574 }, "sample": { "messages": [ @@ -56257,10 +56257,10 @@ ] }, "predict": { - "A": 0.0021239430643618107, - "B": 0.9709493517875671, - "C": 0.0007813549600541592, - "D": 0.012222448363900185 + "A": 0.005122557282447815, + "B": 0.8092871904373169, + "C": 0.0012167140375822783, + "D": 0.1406330168247223 }, "sample": { "messages": [ @@ -56302,10 +56302,10 @@ ] }, "predict": { - "A": 0.0024346571881324053, - "B": 0.9822106957435608, - "C": 0.00029077832004986703, - "D": 0.0006155776209197938 + "A": 0.002034544013440609, + "B": 0.9300810098648071, + "C": 0.00031200741068460047, + "D": 0.0007031195564195514 }, "sample": { "messages": [ @@ -56347,10 +56347,10 @@ ] }, "predict": { - "A": 0.6015467047691345, - "B": 0.07184451073408127, - "C": 0.2841505706310272, - "D": 0.018165137618780136 + "A": 0.7773517370223999, + "B": 0.009785414673388004, + "C": 0.1530696302652359, + "D": 0.007159161847084761 }, "sample": { "messages": [ @@ -56392,10 +56392,10 @@ ] }, "predict": { - "A": 0.012329942546784878, - "B": 0.0018908579368144274, - "C": 0.9794886112213135, - "D": 0.0014726015506312251 + "A": 0.005873534828424454, + "B": 0.0003313621855340898, + "C": 0.9877767562866211, + "D": 0.00012976329890079796 }, "sample": { "messages": [ @@ -56437,10 +56437,10 @@ ] }, "predict": { - "A": 0.9441896080970764, - "B": 0.015261403284966946, - "C": 0.006361899431794882, - "D": 0.008168840780854225 + "A": 0.9261515140533447, + "B": 0.0026013690512627363, + "C": 0.0026013690512627363, + "D": 0.008012780919671059 }, "sample": { "messages": [ @@ -56482,10 +56482,10 @@ ] }, "predict": { - "A": 0.36012500524520874, - "B": 0.593745768070221, - "C": 0.009597006253898144, - "D": 0.02031686156988144 + "A": 0.1901066154241562, + "B": 0.7518863081932068, + "C": 0.0011304152430966496, + "D": 0.029153794050216675 }, "sample": { "messages": [ @@ -56527,10 +56527,10 @@ ] }, "predict": { - "A": 0.001257271389476955, - "B": 0.9476107954978943, - "C": 0.0034176181070506573, - "D": 0.025252971798181534 + "A": 0.0012005509342998266, + "B": 0.9048603177070618, + "C": 0.00270548346452415, + "D": 0.011390510946512222 }, "sample": { "messages": [ @@ -56572,10 +56572,10 @@ ] }, "predict": { - "A": 0.9584502577781677, - "B": 0.0014409717405214906, - "C": 0.0001518772915005684, - "D": 0.00041284531471319497 + "A": 0.9040998220443726, + "B": 0.0001048148114932701, + "C": 2.063925967377145e-05, + "D": 7.203809218481183e-05 }, "sample": { "messages": [ @@ -56617,10 +56617,10 @@ ] }, "predict": { - "A": 0.008257078938186169, - "B": 0.00935649685561657, - "C": 0.012013979256153107, - "D": 0.9543885588645935 + "A": 0.008366349153220654, + "B": 0.0008818067144602537, + "C": 0.0027161561883985996, + "D": 0.9670184850692749 }, "sample": { "messages": [ @@ -56655,17 +56655,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "D" ] }, "predict": { - "A": 0.5183884501457214, - "B": 0.0425519160926342, - "C": 0.06191267445683479, - "D": 0.3562828600406647 + "A": 0.4271753430366516, + "B": 0.007823989726603031, + "C": 0.027308408170938492, + "D": 0.4840530753135681 }, "sample": { "messages": [ @@ -56695,7 +56695,7 @@ "prompt_len": 82, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -56707,10 +56707,10 @@ ] }, "predict": { - "A": 0.07354125380516052, - "B": 0.8959158658981323, - "C": 0.00034056356525979936, - "D": 0.021069921553134918 + "A": 0.09332747012376785, + "B": 0.885466456413269, + "C": 0.00015899453137535602, + "D": 0.0021948518697172403 }, "sample": { "messages": [ @@ -56752,10 +56752,10 @@ ] }, "predict": { - "A": 0.2844942510128021, - "B": 0.07193141430616379, - "C": 0.6022742986679077, - "D": 0.0006223286036401987 + "A": 0.3830697536468506, + "B": 0.027749480679631233, + "C": 0.43407490849494934, + "D": 0.0001990333548747003 }, "sample": { "messages": [ @@ -56797,10 +56797,10 @@ ] }, "predict": { - "A": 0.000764850527048111, - "B": 4.314991383580491e-05, - "C": 0.0004093949683010578, - "D": 0.9504401087760925 + "A": 0.0025310672353953123, + "B": 4.635811274056323e-05, + "C": 0.0005647573852911592, + "D": 0.9011223316192627 }, "sample": { "messages": [ @@ -56842,10 +56842,10 @@ ] }, "predict": { - "A": 0.1531602442264557, - "B": 0.17355330288410187, - "C": 0.6057605147361755, - "D": 0.020727984607219696 + "A": 0.35130900144577026, + "B": 0.03702769801020622, + "C": 0.5111515522003174, + "D": 0.05387497693300247 }, "sample": { "messages": [ @@ -56887,10 +56887,10 @@ ] }, "predict": { - "A": 0.14280307292938232, - "B": 0.004312279634177685, - "C": 0.005537076387554407, - "D": 0.8217750191688538 + "A": 0.1948661506175995, + "B": 0.0009024092578329146, + "C": 0.0027796162758022547, + "D": 0.7707105875015259 }, "sample": { "messages": [ @@ -56925,17 +56925,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "B" + "D" ] }, "predict": { - "A": 0.00039410567842423916, - "B": 0.4897347092628479, - "C": 0.00021094956900924444, - "D": 0.4897347092628479 + "A": 0.0005359024507924914, + "B": 0.15817417204380035, + "C": 0.00026946867001242936, + "D": 0.8032747507095337 }, "sample": { "messages": [ @@ -56977,10 +56977,10 @@ ] }, "predict": { - "A": 0.006480473559349775, - "B": 0.9617875218391418, - "C": 0.002384033054113388, - "D": 0.0018566867802292109 + "A": 0.005508813075721264, + "B": 0.9264399409294128, + "C": 0.0024445210583508015, + "D": 0.001308457925915718 }, "sample": { "messages": [ @@ -57022,10 +57022,10 @@ ] }, "predict": { - "A": 0.9194361567497253, - "B": 0.011573992669582367, - "C": 0.035650406032800674, - "D": 0.00034950432018376887 + "A": 0.8632524013519287, + "B": 0.0031133743468672037, + "C": 0.015811020508408546, + "D": 0.00016500278434250504 }, "sample": { "messages": [ @@ -57067,10 +57067,10 @@ ] }, "predict": { - "A": 0.010121060535311699, - "B": 0.06599760055541992, - "C": 0.9110687375068665, - "D": 0.0007331671076826751 + "A": 0.0016715325182303786, + "B": 0.007491288240998983, + "C": 0.9811651706695557, + "D": 0.0002262173074996099 }, "sample": { "messages": [ @@ -57112,10 +57112,10 @@ ] }, "predict": { - "A": 0.00960854534059763, - "B": 0.0024294208269566298, - "C": 0.9800982475280762, - "D": 0.00010027416283264756 + "A": 0.005156179424375296, + "B": 0.00045054193469695747, + "C": 0.982593834400177, + "D": 2.8802181986975484e-05 }, "sample": { "messages": [ @@ -57157,10 +57157,10 @@ ] }, "predict": { - "A": 0.0007828539237380028, - "B": 0.9728119373321533, - "C": 0.00028799587744288146, - "D": 0.00013603961269836873 + "A": 0.00044002829235978425, + "B": 0.9015213251113892, + "C": 0.00016187735309358686, + "D": 7.183263369370252e-05 }, "sample": { "messages": [ @@ -57198,14 +57198,14 @@ "acc": false, "f1_macro": [ "A", - "B" + "C" ] }, "predict": { - "A": 0.3311348557472229, - "B": 0.3752249479293823, - "C": 0.0837240070104599, - "D": 0.17724372446537018 + "A": 0.2807141840457916, + "B": 0.13259997963905334, + "C": 0.40843671560287476, + "D": 0.09113454818725586 }, "sample": { "messages": [ @@ -57235,22 +57235,22 @@ "prompt_len": 104, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.3976139724254608, - "B": 0.5785248875617981, - "C": 0.0038980699609965086, - "D": 0.001624957425519824 + "A": 0.7340853214263916, + "B": 0.18560583889484406, + "C": 0.00436503067612648, + "D": 0.0005213285912759602 }, "sample": { "messages": [ @@ -57280,7 +57280,7 @@ "prompt_len": 80, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -57292,10 +57292,10 @@ ] }, "predict": { - "A": 0.0016563740791752934, - "B": 0.0006904793553985655, - "C": 0.00016400322783738375, - "D": 0.9722673296928406 + "A": 0.0016409172676503658, + "B": 0.0001346946955891326, + "C": 0.00011886763968504965, + "D": 0.9631944894790649 }, "sample": { "messages": [ @@ -57337,10 +57337,10 @@ ] }, "predict": { - "A": 0.017586365342140198, - "B": 0.0008755735470913351, - "C": 0.0008755735470913351, - "D": 0.9601829648017883 + "A": 0.011665619909763336, + "B": 0.00013795137056149542, + "C": 0.00022744339366909117, + "D": 0.9267149567604065 }, "sample": { "messages": [ @@ -57382,10 +57382,10 @@ ] }, "predict": { - "A": 0.001666102558374405, - "B": 0.9779778718948364, - "C": 0.00032807502429932356, - "D": 0.009587758220732212 + "A": 0.005646239500492811, + "B": 0.949551522731781, + "C": 0.0007641355041414499, + "D": 0.019707312807440758 }, "sample": { "messages": [ @@ -57427,10 +57427,10 @@ ] }, "predict": { - "A": 0.03509298712015152, - "B": 0.03509298712015152, - "C": 0.9050600528717041, - "D": 0.012909987941384315 + "A": 0.015558662824332714, + "B": 0.004457632079720497, + "C": 0.9625803828239441, + "D": 0.0004698309348896146 }, "sample": { "messages": [ @@ -57472,10 +57472,10 @@ ] }, "predict": { - "A": 0.0011321213096380234, - "B": 0.00947913620620966, - "C": 0.00034527748357504606, - "D": 0.9668981432914734 + "A": 0.0003169085830450058, + "B": 0.0008614468388259411, + "C": 0.00010288515477441251, + "D": 0.9446911215782166 }, "sample": { "messages": [ @@ -57510,17 +57510,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.12791761755943298, - "B": 0.06042400375008583, - "C": 0.5059239864349365, - "D": 0.2708016037940979 + "A": 0.06215948611497879, + "B": 0.01386965624988079, + "C": 0.40533074736595154, + "D": 0.4592999219894409 }, "sample": { "messages": [ @@ -57550,7 +57550,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -57562,10 +57562,10 @@ ] }, "predict": { - "A": 0.0010212933411821723, - "B": 0.9883825182914734, - "C": 0.00012197608884889632, - "D": 0.00048242483171634376 + "A": 0.0006775970105081797, + "B": 0.9541276097297668, + "C": 5.225067070568912e-05, + "D": 0.000465705175884068 }, "sample": { "messages": [ @@ -57607,10 +57607,10 @@ ] }, "predict": { - "A": 0.9660221338272095, - "B": 0.006509006023406982, - "C": 0.0006054318509995937, - "D": 0.0030746369156986475 + "A": 0.8896012306213379, + "B": 0.0017173344967886806, + "C": 0.0007158914813771844, + "D": 0.0010416159639135003 }, "sample": { "messages": [ @@ -57652,10 +57652,10 @@ ] }, "predict": { - "A": 0.9313163757324219, - "B": 0.013284514658153057, - "C": 0.001400177483446896, - "D": 0.013284514658153057 + "A": 0.7564389109611511, + "B": 0.008403277955949306, + "C": 0.0018750246381387115, + "D": 0.008403277955949306 }, "sample": { "messages": [ @@ -57697,10 +57697,10 @@ ] }, "predict": { - "A": 0.10322049260139465, - "B": 0.008472854271531105, - "C": 0.005139045417308807, - "D": 0.8642545938491821 + "A": 0.02541361190378666, + "B": 0.0012652692385017872, + "C": 0.002516288310289383, + "D": 0.9536387324333191 }, "sample": { "messages": [ @@ -57742,10 +57742,10 @@ ] }, "predict": { - "A": 0.13045719265937805, - "B": 0.8506877422332764, - "C": 0.0004705020983237773, - "D": 0.004463999532163143 + "A": 0.12624555826187134, + "B": 0.8232243657112122, + "C": 0.00033311377046629786, + "D": 0.005546842236071825 }, "sample": { "messages": [ @@ -57787,10 +57787,10 @@ ] }, "predict": { - "A": 0.8889161348342896, - "B": 0.005989469587802887, - "C": 0.004116498399525881, - "D": 0.07296667993068695 + "A": 0.8566159605979919, + "B": 0.0010029941331595182, + "C": 0.005093624349683523, + "D": 0.005093624349683523 }, "sample": { "messages": [ @@ -57832,10 +57832,10 @@ ] }, "predict": { - "A": 0.4183615744113922, - "B": 0.22393281757831573, - "C": 0.1358221173286438, - "D": 0.17439904808998108 + "A": 0.5164744257926941, + "B": 0.07920392602682114, + "C": 0.1016998440027237, + "D": 0.19000035524368286 }, "sample": { "messages": [ @@ -57877,10 +57877,10 @@ ] }, "predict": { - "A": 0.028837867081165314, - "B": 0.9549790024757385, - "C": 0.0005281839403323829, - "D": 0.0012670473661273718 + "A": 0.019342651590704918, + "B": 0.9319812059402466, + "C": 0.0005840974627062678, + "D": 0.0012365344446152449 }, "sample": { "messages": [ @@ -57922,10 +57922,10 @@ ] }, "predict": { - "A": 0.003525193314999342, - "B": 0.9774383902549744, - "C": 0.0002117043040925637, - "D": 0.002422827761620283 + "A": 0.0032339782919734716, + "B": 0.8966925144195557, + "C": 8.618260471848771e-05, + "D": 0.0004959466168656945 }, "sample": { "messages": [ @@ -57967,10 +57967,10 @@ ] }, "predict": { - "A": 0.009468276984989643, - "B": 0.9657905101776123, - "C": 0.0023939553648233414, - "D": 0.00835572462528944 + "A": 0.009236269630491734, + "B": 0.9421250820159912, + "C": 0.0020608901977539062, + "D": 0.004943818785250187 }, "sample": { "messages": [ @@ -58012,10 +58012,10 @@ ] }, "predict": { - "A": 0.002408586675301194, - "B": 0.002408586675301194, - "C": 0.0006482623284682631, - "D": 0.971693217754364 + "A": 0.00039981459849514067, + "B": 0.0008464074926450849, + "C": 0.0019074087031185627, + "D": 0.928198516368866 }, "sample": { "messages": [ @@ -58057,10 +58057,10 @@ ] }, "predict": { - "A": 0.0001225689338753, - "B": 8.424030966125429e-05, - "C": 0.00037753890501335263, - "D": 0.993186354637146 + "A": 3.900904266629368e-05, + "B": 2.681050136743579e-05, + "C": 0.00012015632091788575, + "D": 0.9736367464065552 }, "sample": { "messages": [ @@ -58102,10 +58102,10 @@ ] }, "predict": { - "A": 0.9554818272590637, - "B": 0.005013908725231886, - "C": 0.0009872971568256617, - "D": 0.008266537450253963 + "A": 0.9033231139183044, + "B": 0.000343379273544997, + "C": 0.00030303114908747375, + "D": 0.0004996138741262257 }, "sample": { "messages": [ @@ -58147,10 +58147,10 @@ ] }, "predict": { - "A": 0.0007978330249898136, - "B": 0.0011608401546254754, - "C": 0.9914257526397705, - "D": 0.0002757237234618515 + "A": 0.00042595344712026417, + "B": 0.00035312780528329313, + "C": 0.9888817667961121, + "D": 7.879346230765805e-05 }, "sample": { "messages": [ @@ -58185,17 +58185,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.35946983098983765, - "B": 0.5926655530929565, - "C": 0.009579546749591827, - "D": 0.008453919552266598 + "A": 0.8379254341125488, + "B": 0.02867223136126995, + "C": 0.010547924786806107, + "D": 0.004397029057145119 }, "sample": { "messages": [ @@ -58225,7 +58225,7 @@ "prompt_len": 66, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -58237,10 +58237,10 @@ ] }, "predict": { - "A": 0.09060599654912949, - "B": 0.7586347460746765, - "C": 0.11634040623903275, - "D": 0.003100366797298193 + "A": 0.1323304921388626, + "B": 0.5930641293525696, + "C": 0.21817612648010254, + "D": 0.0022768720518797636 }, "sample": { "messages": [ @@ -58282,10 +58282,10 @@ ] }, "predict": { - "A": 0.0017673491965979338, - "B": 0.0663192942738533, - "C": 0.9155095219612122, - "D": 0.0005737742176279426 + "A": 0.0012858089758083224, + "B": 0.012199416756629944, + "C": 0.969119668006897, + "D": 0.00022344010358210653 }, "sample": { "messages": [ @@ -58320,17 +58320,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "C" ] }, "predict": { - "A": 0.08224935829639435, - "B": 0.03428663685917854, - "C": 0.2533458471298218, - "D": 0.6077451109886169 + "A": 0.21961380541324615, + "B": 0.00851535052061081, + "C": 0.5969721674919128, + "D": 0.13320249319076538 }, "sample": { "messages": [ @@ -58360,22 +58360,22 @@ "prompt_len": 81, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.3361356258392334, - "B": 0.3808915615081787, - "C": 0.140122190117836, - "D": 0.07500199228525162 + "A": 0.7161351442337036, + "B": 0.20517615973949432, + "C": 0.03565426915884018, + "D": 0.004825280513614416 }, "sample": { "messages": [ @@ -58405,7 +58405,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -58417,10 +58417,10 @@ ] }, "predict": { - "A": 0.33917686343193054, - "B": 0.1247762143611908, - "C": 0.2057211846113205, - "D": 0.2641512155532837 + "A": 0.3649898171424866, + "B": 0.055973004549741745, + "C": 0.1342722624540329, + "D": 0.28425437211990356 }, "sample": { "messages": [ @@ -58462,10 +58462,10 @@ ] }, "predict": { - "A": 0.7613903284072876, - "B": 0.14992663264274597, - "C": 0.02605334296822548, - "D": 0.01790618523955345 + "A": 0.9079663157463074, + "B": 0.02135332114994526, + "C": 0.003949967212975025, + "D": 0.0129514429718256 }, "sample": { "messages": [ @@ -58507,10 +58507,10 @@ ] }, "predict": { - "A": 0.5409486889839172, - "B": 0.008743621408939362, - "C": 0.3281019926071167, - "D": 0.0829571783542633 + "A": 0.779566764831543, + "B": 0.00193235301412642, + "C": 0.07251100242137909, + "D": 0.08216573297977448 }, "sample": { "messages": [ @@ -58552,10 +58552,10 @@ ] }, "predict": { - "A": 0.0006158670294098556, - "B": 0.9826724529266357, - "C": 3.474484401522204e-05, - "D": 0.00017644886975176632 + "A": 0.0007100513903424144, + "B": 0.9392502903938293, + "C": 3.119747634627856e-05, + "D": 8.480354154016823e-05 }, "sample": { "messages": [ @@ -58597,10 +58597,10 @@ ] }, "predict": { - "A": 0.9474256634712219, - "B": 0.015313709154725075, - "C": 0.0026611238718032837, - "D": 0.0018289617728441954 + "A": 0.7687796354293823, + "B": 0.0027726523112505674, + "C": 0.0024468570481985807, + "D": 0.0005459676031023264 }, "sample": { "messages": [ @@ -58642,10 +58642,10 @@ ] }, "predict": { - "A": 0.0012863569427281618, - "B": 0.9695326685905457, - "C": 0.005087640602141619, - "D": 0.0008840993396006525 + "A": 0.001504774671047926, + "B": 0.8832806944847107, + "C": 0.005252178758382797, + "D": 0.0008054477511905134 }, "sample": { "messages": [ @@ -58687,10 +58687,10 @@ ] }, "predict": { - "A": 0.0016702363500371575, - "B": 0.0011479353997856379, - "C": 0.00015535615966655314, - "D": 0.9804043173789978 + "A": 0.0004691491194535047, + "B": 0.00020818364282604307, + "C": 8.15258827060461e-05, + "D": 0.961183488368988 }, "sample": { "messages": [ @@ -58732,10 +58732,10 @@ ] }, "predict": { - "A": 0.04567914456129074, - "B": 0.9174901843070984, - "C": 0.014829847030341625, - "D": 0.0012173079885542393 + "A": 0.0070066796615719795, + "B": 0.9176940321922302, + "C": 0.008996755816042423, + "D": 0.000289200252154842 }, "sample": { "messages": [ @@ -58777,10 +58777,10 @@ ] }, "predict": { - "A": 0.09097440540790558, - "B": 0.5235216021537781, - "C": 0.003527460852637887, - "D": 0.35981079936027527 + "A": 0.25673145055770874, + "B": 0.37354207038879395, + "C": 0.007752617821097374, + "D": 0.3296497166156769 }, "sample": { "messages": [ @@ -58822,10 +58822,10 @@ ] }, "predict": { - "A": 0.8668614625930786, - "B": 0.029662366956472397, - "C": 0.014011510647833347, - "D": 0.012365113943815231 + "A": 0.7258256673812866, + "B": 0.007115744519978762, + "C": 0.09822981804609299, + "D": 0.002310144016519189 }, "sample": { "messages": [ @@ -58867,10 +58867,10 @@ ] }, "predict": { - "A": 0.00020093348575755954, - "B": 0.9875416159629822, - "C": 8.376152982236817e-05, - "D": 0.00022768745839130133 + "A": 0.00017750838014762849, + "B": 0.9286785125732422, + "C": 9.501339081907645e-05, + "D": 0.0003115369181614369 }, "sample": { "messages": [ @@ -58912,10 +58912,10 @@ ] }, "predict": { - "A": 0.022208359092473984, - "B": 0.019598806276917458, - "C": 0.9443234205245972, - "D": 0.0023407437838613987 + "A": 0.013475487940013409, + "B": 0.02852761000394821, + "C": 0.9447046518325806, + "D": 0.0023416888434439898 }, "sample": { "messages": [ @@ -58957,10 +58957,10 @@ ] }, "predict": { - "A": 0.012147254310548306, - "B": 0.27647021412849426, - "C": 0.02571573480963707, - "D": 0.6632175445556641 + "A": 0.00242973561398685, + "B": 0.031509287655353546, + "C": 0.011591619811952114, + "D": 0.9208364486694336 }, "sample": { "messages": [ @@ -59002,10 +59002,10 @@ ] }, "predict": { - "A": 0.9567746520042419, - "B": 0.0023716073483228683, - "C": 7.623518467880785e-05, - "D": 0.0012694299221038818 + "A": 0.8925634622573853, + "B": 0.0002331898285774514, + "C": 3.35941840603482e-05, + "D": 0.0001933211606228724 }, "sample": { "messages": [ @@ -59047,10 +59047,10 @@ ] }, "predict": { - "A": 0.9156654477119446, - "B": 0.016770998015999794, - "C": 0.0009461549343541265, - "D": 0.013061265461146832 + "A": 0.786916971206665, + "B": 0.006008184980601072, + "C": 0.0004352314572315663, + "D": 0.004679179284721613 }, "sample": { "messages": [ @@ -59092,10 +59092,10 @@ ] }, "predict": { - "A": 0.09723901003599167, - "B": 0.05204828828573227, - "C": 0.8141722679138184, - "D": 0.004272383637726307 + "A": 0.06432248651981354, + "B": 0.009864172898232937, + "C": 0.8879445195198059, + "D": 0.001610281877219677 }, "sample": { "messages": [ @@ -59137,10 +59137,10 @@ ] }, "predict": { - "A": 0.9389511942863464, - "B": 0.041254639625549316, - "C": 0.003386386903002858, - "D": 0.004348207265138626 + "A": 0.9697825908660889, + "B": 0.003497582394629717, + "C": 0.00100207410287112, + "D": 0.00100207410287112 }, "sample": { "messages": [ @@ -59182,10 +59182,10 @@ ] }, "predict": { - "A": 0.019684379920363426, - "B": 0.9484465718269348, - "C": 0.004976991098374128, - "D": 0.01053629070520401 + "A": 0.013201544992625713, + "B": 0.9254997968673706, + "C": 0.0011535382363945246, + "D": 0.006235968787223101 }, "sample": { "messages": [ @@ -59227,10 +59227,10 @@ ] }, "predict": { - "A": 0.9467885494232178, - "B": 0.004384500905871391, - "C": 0.0004621230182237923, - "D": 0.017341038212180138 + "A": 0.9169662594795227, + "B": 0.0006512059480883181, + "C": 0.00011316261952742934, + "D": 0.006178469862788916 }, "sample": { "messages": [ @@ -59272,10 +59272,10 @@ ] }, "predict": { - "A": 0.010351044125854969, - "B": 0.0361286923289299, - "C": 0.00011498970707179978, - "D": 0.9317712783813477 + "A": 0.001103403978049755, + "B": 0.0026469288859516382, + "C": 9.641436190577224e-05, + "D": 0.9423718452453613 }, "sample": { "messages": [ @@ -59310,17 +59310,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.26878637075424194, - "B": 0.6447850465774536, - "C": 0.0033835212234407663, - "D": 0.059974346309900284 + "A": 0.7102676033973694, + "B": 0.17958377301692963, + "C": 0.00894095003604889, + "D": 0.014741133898496628 }, "sample": { "messages": [ @@ -59350,7 +59350,7 @@ "prompt_len": 78, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -59362,10 +59362,10 @@ ] }, "predict": { - "A": 0.0005362930241972208, - "B": 0.9696406126022339, - "C": 0.0001443413202650845, - "D": 0.00025332687073387206 + "A": 0.0005444632261060178, + "B": 0.9247701168060303, + "C": 0.0004804871277883649, + "D": 0.00033023362630046904 }, "sample": { "messages": [ @@ -59407,10 +59407,10 @@ ] }, "predict": { - "A": 0.9767365455627441, - "B": 0.0018855450907722116, - "C": 7.311052468139678e-05, - "D": 5.6938526540761814e-05 + "A": 0.9143048524856567, + "B": 0.0003475537523627281, + "C": 3.899444709531963e-05, + "D": 2.3651329684071243e-05 }, "sample": { "messages": [ @@ -59445,17 +59445,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.39383772015571594, - "B": 0.5056976675987244, - "C": 0.025177201256155968, - "D": 0.03232816606760025 + "A": 0.7730106711387634, + "B": 0.05599670484662056, + "C": 0.006687852554023266, + "D": 0.0035797497257590294 }, "sample": { "messages": [ @@ -59485,7 +59485,7 @@ "prompt_len": 90, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -59497,10 +59497,10 @@ ] }, "predict": { - "A": 0.00013817725994158536, - "B": 3.4936678275698796e-05, - "C": 3.9588441723026335e-05, - "D": 0.9880980849266052 + "A": 3.2107847800944e-05, + "B": 6.7301707531441934e-06, + "C": 1.7186095647048205e-05, + "D": 0.9666584730148315 }, "sample": { "messages": [ @@ -59535,17 +59535,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.24832822382450104, - "B": 0.10351859778165817, - "C": 0.026173602789640427, - "D": 0.595708429813385 + "A": 0.6150728464126587, + "B": 0.12111502140760422, + "C": 0.039320290088653564, + "D": 0.15551477670669556 }, "sample": { "messages": [ @@ -59575,7 +59575,7 @@ "prompt_len": 108, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -59587,10 +59587,10 @@ ] }, "predict": { - "A": 0.002414231188595295, - "B": 0.010819833725690842, - "C": 0.973970353603363, - "D": 0.003099934197962284 + "A": 0.0005832399474456906, + "B": 0.0010236180387437344, + "C": 0.9906322956085205, + "D": 0.0002431306056678295 }, "sample": { "messages": [ @@ -59632,10 +59632,10 @@ ] }, "predict": { - "A": 0.920403003692627, - "B": 0.035687897354364395, - "C": 0.0007406786899082363, - "D": 0.007027362938970327 + "A": 0.8602445125579834, + "B": 0.005115200765430927, + "C": 0.00030719165806658566, + "D": 0.0011413556057959795 }, "sample": { "messages": [ @@ -59677,10 +59677,10 @@ ] }, "predict": { - "A": 0.00641216803342104, - "B": 0.028737341985106468, - "C": 0.0004644959117285907, - "D": 0.951650083065033 + "A": 0.0027088718488812447, + "B": 0.0023905709385871887, + "C": 0.0002519642875995487, + "D": 0.9644250869750977 }, "sample": { "messages": [ @@ -59722,10 +59722,10 @@ ] }, "predict": { - "A": 0.14873027801513672, - "B": 0.14873027801513672, - "C": 0.6665628552436829, - "D": 0.01567605510354042 + "A": 0.05118439346551895, + "B": 0.01294144056737423, + "C": 0.9072650671005249, + "D": 0.0021126321516931057 }, "sample": { "messages": [ @@ -59763,14 +59763,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.25871700048446655, - "B": 0.6206298470497131, - "C": 0.003256766591221094, - "D": 0.09517667442560196 + "A": 0.5040207505226135, + "B": 0.39253175258636475, + "C": 0.017246641218662262, + "D": 0.011853432282805443 }, "sample": { "messages": [ @@ -59800,7 +59800,7 @@ "prompt_len": 92, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -59812,10 +59812,10 @@ ] }, "predict": { - "A": 0.05896313488483429, - "B": 0.6339134573936462, - "C": 0.2058015763759613, - "D": 0.021691326051950455 + "A": 0.1758245974779129, + "B": 0.4217812716960907, + "C": 0.1758245974779129, + "D": 0.03923176974058151 }, "sample": { "messages": [ @@ -59857,10 +59857,10 @@ ] }, "predict": { - "A": 0.0010136167984455824, - "B": 0.9809533953666687, - "C": 0.0001003616489470005, - "D": 0.0003728887822944671 + "A": 0.002057920675724745, + "B": 0.9407675862312317, + "C": 0.00011609993816819042, + "D": 0.0004052286094520241 }, "sample": { "messages": [ @@ -59895,17 +59895,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "D" + "C" ] }, "predict": { - "A": 0.02860749326646328, - "B": 0.032416537404060364, - "C": 0.34851062297821045, - "D": 0.5745968818664551 + "A": 0.004061846062541008, + "B": 0.01820393092930317, + "C": 0.5319968461990356, + "D": 0.41431957483291626 }, "sample": { "messages": [ @@ -59935,7 +59935,7 @@ "prompt_len": 115, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -59947,10 +59947,10 @@ ] }, "predict": { - "A": 0.008365499787032604, - "B": 0.9669203162193298, - "C": 0.012171730399131775, - "D": 0.0018665953539311886 + "A": 0.01182365883141756, + "B": 0.9392695426940918, + "C": 0.010434342548251152, + "D": 0.002328216563910246 }, "sample": { "messages": [ @@ -59992,10 +59992,10 @@ ] }, "predict": { - "A": 0.002164787147194147, - "B": 0.001022572978399694, - "C": 0.9896209239959717, - "D": 0.00022816687123849988 + "A": 0.0006976165459491313, + "B": 0.0002566387702245265, + "C": 0.9823172092437744, + "D": 0.000106982966826763 }, "sample": { "messages": [ @@ -60037,10 +60037,10 @@ ] }, "predict": { - "A": 0.0027439696714282036, - "B": 0.010852610692381859, - "C": 0.9769207835197449, - "D": 0.0011438566725701094 + "A": 0.004014114383608103, + "B": 0.004548587836325169, + "C": 0.9822214245796204, + "D": 0.00039745113463141024 }, "sample": { "messages": [ @@ -60082,10 +60082,10 @@ ] }, "predict": { - "A": 0.005807459820061922, - "B": 0.048625268042087555, - "C": 0.7606271505355835, - "D": 0.16971886157989502 + "A": 0.00214474112726748, + "B": 0.006206014193594456, + "C": 0.812827467918396, + "D": 0.16005520522594452 }, "sample": { "messages": [ @@ -60127,10 +60127,10 @@ ] }, "predict": { - "A": 0.10444312542676926, - "B": 0.10444312542676926, - "C": 0.003573847468942404, - "D": 0.7717361450195312 + "A": 0.03533071279525757, + "B": 0.021429162472486496, + "C": 0.0016524394741281867, + "D": 0.9111911654472351 }, "sample": { "messages": [ @@ -60172,10 +60172,10 @@ ] }, "predict": { - "A": 0.015562553890049458, - "B": 0.0021061627194285393, - "C": 0.0005668659578077495, - "D": 0.9628211259841919 + "A": 0.015341650694608688, + "B": 0.0008655167184770107, + "C": 0.00021883689623791724, + "D": 0.9491543173789978 }, "sample": { "messages": [ @@ -60217,10 +60217,10 @@ ] }, "predict": { - "A": 0.774152398109436, - "B": 0.1957363784313202, - "C": 0.00037786015309393406, - "D": 0.00037786015309393406 + "A": 0.510903537273407, + "B": 0.3978920578956604, + "C": 0.00019420922035351396, + "D": 0.00020673463586717844 }, "sample": { "messages": [ @@ -60262,10 +60262,10 @@ ] }, "predict": { - "A": 0.005807180888950825, - "B": 0.005124819464981556, - "C": 0.976617693901062, - "D": 0.001143501722253859 + "A": 0.0007913927547633648, + "B": 0.0004800040042027831, + "C": 0.9834227561950684, + "D": 0.0002911371411755681 }, "sample": { "messages": [ @@ -60307,10 +60307,10 @@ ] }, "predict": { - "A": 8.447570871794596e-05, - "B": 0.00022962878574617207, - "C": 0.9959617257118225, - "D": 0.00010846897203009576 + "A": 3.9777514757588506e-05, + "B": 3.297672083135694e-05, + "C": 0.9928172826766968, + "D": 4.234294465277344e-05 }, "sample": { "messages": [ @@ -60352,10 +60352,10 @@ ] }, "predict": { - "A": 0.9639328122138977, - "B": 0.003476484678685665, - "C": 0.0009960294701159, - "D": 0.001449214294552803 + "A": 0.8996295928955078, + "B": 0.0012705923290923238, + "C": 0.0017366937827318907, + "D": 0.0017366937827318907 }, "sample": { "messages": [ @@ -60397,10 +60397,10 @@ ] }, "predict": { - "A": 0.20013096928596497, - "B": 0.02708478271961212, - "C": 0.12138556689023972, - "D": 0.6164467930793762 + "A": 0.12946650385856628, + "B": 0.005019961390644312, + "C": 0.11425378918647766, + "D": 0.6574852466583252 }, "sample": { "messages": [ @@ -60442,10 +60442,10 @@ ] }, "predict": { - "A": 0.00965079665184021, - "B": 0.00037420200533233583, - "C": 0.9844079613685608, - "D": 9.461307490710169e-05 + "A": 0.005177538376301527, + "B": 4.7683482989668846e-05, + "C": 0.9866641759872437, + "D": 1.7541773559059948e-05 }, "sample": { "messages": [ @@ -60487,10 +60487,10 @@ ] }, "predict": { - "A": 0.025045111775398254, - "B": 0.010440356098115444, - "C": 0.0055883196182549, - "D": 0.9398108720779419 + "A": 0.011951343156397343, + "B": 0.002210776088759303, + "C": 0.005645414814352989, + "D": 0.9494128227233887 }, "sample": { "messages": [ @@ -60532,10 +60532,10 @@ ] }, "predict": { - "A": 0.9510617256164551, - "B": 0.0034300643019378185, - "C": 0.0004096627526450902, - "D": 0.0018359810346737504 + "A": 0.8655456304550171, + "B": 0.0005424605333246291, + "C": 0.0002903582062572241, + "D": 0.0004787197685800493 }, "sample": { "messages": [ @@ -60577,10 +60577,10 @@ ] }, "predict": { - "A": 0.003976270090788603, - "B": 0.9729612469673157, - "C": 0.0007829740061424673, - "D": 0.0011392204323783517 + "A": 0.0034755875822156668, + "B": 0.8504482507705688, + "C": 0.0023887341376394033, + "D": 0.0012785971630364656 }, "sample": { "messages": [ @@ -60622,10 +60622,10 @@ ] }, "predict": { - "A": 0.035597797483205795, - "B": 0.031414944678545, - "C": 0.9180793166160583, - "D": 0.0021378137171268463 + "A": 0.041193895041942596, + "B": 0.0071584247052669525, + "C": 0.9375686645507812, + "D": 0.0010977800702676177 }, "sample": { "messages": [ @@ -60660,17 +60660,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.21234317123889923, - "B": 0.7411504983901978, - "C": 0.013574644923210144, - "D": 0.015382086858153343 + "A": 0.528624415397644, + "B": 0.41169309616088867, + "C": 0.008544418029487133, + "D": 0.023226136341691017 }, "sample": { "messages": [ @@ -60700,7 +60700,7 @@ "prompt_len": 63, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -60712,10 +60712,10 @@ ] }, "predict": { - "A": 0.000697656418196857, - "B": 0.0018964267801493406, - "C": 0.004549291450530291, - "D": 0.9823733568191528 + "A": 3.621692303568125e-05, + "B": 6.356267840601504e-05, + "C": 0.00018392469792161137, + "D": 0.9622470736503601 }, "sample": { "messages": [ @@ -60757,10 +60757,10 @@ ] }, "predict": { - "A": 0.04647358879446983, - "B": 0.38911858201026917, - "C": 0.04101279750466347, - "D": 0.499638170003891 + "A": 0.04425835609436035, + "B": 0.37057068943977356, + "C": 0.016281738877296448, + "D": 0.5391771197319031 }, "sample": { "messages": [ @@ -60802,10 +60802,10 @@ ] }, "predict": { - "A": 0.01108035072684288, - "B": 0.8802212476730347, - "C": 0.034129880368709564, - "D": 0.04965868964791298 + "A": 0.0009517150465399027, + "B": 0.9210464358329773, + "C": 0.006205962039530277, + "D": 0.02166093699634075 }, "sample": { "messages": [ @@ -60847,10 +60847,10 @@ ] }, "predict": { - "A": 0.0027665498200803995, - "B": 0.0027665498200803995, - "C": 0.9849599003791809, - "D": 0.00018826605810318142 + "A": 0.0019023013301193714, + "B": 0.00035189033951610327, + "C": 0.9854164719581604, + "D": 7.376023131655529e-05 }, "sample": { "messages": [ @@ -60892,10 +60892,10 @@ ] }, "predict": { - "A": 0.012648453004658222, - "B": 0.08247827738523483, - "C": 0.8867250680923462, - "D": 0.001711782068014145 + "A": 0.009482447057962418, + "B": 0.01379687711596489, + "C": 0.967235803604126, + "D": 0.00011213434481760487 }, "sample": { "messages": [ @@ -60937,10 +60937,10 @@ ] }, "predict": { - "A": 0.03895004466176033, - "B": 0.8864989280700684, - "C": 0.007669718936085701, - "D": 0.03895004466176033 + "A": 0.034209322184324265, + "B": 0.882270097732544, + "C": 0.009207314811646938, + "D": 0.04977427423000336 }, "sample": { "messages": [ @@ -60982,10 +60982,10 @@ ] }, "predict": { - "A": 0.0010122853564098477, - "B": 0.000893338758032769, - "C": 0.0005767828552052379, - "D": 0.9796649217605591 + "A": 0.0002870603930205107, + "B": 0.00010560361261013895, + "C": 0.00011241446190979332, + "D": 0.969651997089386 }, "sample": { "messages": [ @@ -61027,10 +61027,10 @@ ] }, "predict": { - "A": 0.6397069692611694, - "B": 0.26666954159736633, - "C": 0.03184913471341133, - "D": 0.02810676395893097 + "A": 0.6135982275009155, + "B": 0.10662737488746643, + "C": 0.17579883337020874, + "D": 0.026959624141454697 }, "sample": { "messages": [ @@ -61065,17 +61065,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.2359113246202469, - "B": 0.726658046245575, - "C": 0.0018012053333222866, - "D": 0.0007508540293201804 + "A": 0.43571388721466064, + "B": 0.43571388721466064, + "C": 0.0015714296605437994, + "D": 0.0006973177078180015 }, "sample": { "messages": [ @@ -61105,7 +61105,7 @@ "prompt_len": 65, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -61117,10 +61117,10 @@ ] }, "predict": { - "A": 0.5832057595252991, - "B": 0.14745751023292542, - "C": 0.1670912653207779, - "D": 0.061469435691833496 + "A": 0.8515485525131226, + "B": 0.015596656128764153, + "C": 0.04239610210061073, + "D": 0.007367338519543409 }, "sample": { "messages": [ @@ -61155,17 +61155,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.005920374300330877, - "B": 0.6843022704124451, - "C": 0.22216041386127472, - "D": 0.04957069084048271 + "A": 0.0025265696458518505, + "B": 0.11436127126216888, + "C": 0.8450218439102173, + "D": 0.019873009994626045 }, "sample": { "messages": [ @@ -61195,22 +61195,22 @@ "prompt_len": 143, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.43823671340942383, - "B": 0.01699228212237358, - "C": 0.09778383374214172, - "D": 0.3867425322532654 + "A": 0.16280856728553772, + "B": 0.003596909809857607, + "C": 0.1118965670466423, + "D": 0.643920361995697 }, "sample": { "messages": [ @@ -61240,7 +61240,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -61252,10 +61252,10 @@ ] }, "predict": { - "A": 0.05073004961013794, - "B": 0.6180185079574585, - "C": 0.2006412297487259, - "D": 0.0947762131690979 + "A": 0.025104476138949394, + "B": 0.5042368769645691, + "C": 0.0876232236623764, + "D": 0.3465566337108612 }, "sample": { "messages": [ @@ -61297,10 +61297,10 @@ ] }, "predict": { - "A": 0.0058492012321949005, - "B": 0.0011517760576680303, - "C": 0.9836844205856323, - "D": 0.00035127182491123676 + "A": 0.003957727458328009, + "B": 0.00025300902780145407, + "C": 0.9684239029884338, + "D": 6.0094909713370726e-05 }, "sample": { "messages": [ @@ -61342,10 +61342,10 @@ ] }, "predict": { - "A": 0.003998269326984882, - "B": 0.0007873058784753084, - "C": 0.0024250729475170374, - "D": 0.978344202041626 + "A": 0.0014477063668891788, + "B": 0.00016242815763689578, + "C": 0.0018588918028399348, + "D": 0.9629297852516174 }, "sample": { "messages": [ @@ -61387,10 +61387,10 @@ ] }, "predict": { - "A": 0.0005434053600765765, - "B": 0.9824999570846558, - "C": 0.00017641788872424513, - "D": 0.0031270820181816816 + "A": 0.00029216406983323395, + "B": 0.92709881067276, + "C": 0.0002275376027682796, + "D": 0.00908895768225193 }, "sample": { "messages": [ @@ -61425,17 +61425,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.16866125166416168, - "B": 0.04832225665450096, - "C": 0.0024058236740529537, - "D": 0.7558872699737549 + "A": 0.46643829345703125, + "B": 0.005181661807000637, + "C": 0.001020329655148089, + "D": 0.46643829345703125 }, "sample": { "messages": [ @@ -61465,7 +61465,7 @@ "prompt_len": 77, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -61477,10 +61477,10 @@ ] }, "predict": { - "A": 0.003564976854249835, - "B": 0.0013114816974848509, - "C": 0.9884693026542664, - "D": 0.00010113060852745548 + "A": 0.0016826814971864223, + "B": 0.0003313395718578249, + "C": 0.9877094030380249, + "D": 3.9572867535753176e-05 }, "sample": { "messages": [ @@ -61522,10 +61522,10 @@ ] }, "predict": { - "A": 0.0028994001913815737, - "B": 0.00838969275355339, - "C": 0.9697166085243225, - "D": 0.012206930667161942 + "A": 0.0042556715197861195, + "B": 0.003528075758367777, + "C": 0.7618521451950073, + "D": 0.2182743102312088 }, "sample": { "messages": [ @@ -61567,10 +61567,10 @@ ] }, "predict": { - "A": 0.06266651302576065, - "B": 0.1326649934053421, - "C": 0.023053722456097603, - "D": 0.7634344100952148 + "A": 0.03549552336335182, + "B": 0.01305807288736105, + "C": 0.011523709632456303, + "D": 0.915441632270813 }, "sample": { "messages": [ @@ -61612,10 +61612,10 @@ ] }, "predict": { - "A": 0.013889179565012455, - "B": 0.9737067222595215, - "C": 0.0006915015401318669, - "D": 0.0003266422136221081 + "A": 0.015019653365015984, + "B": 0.9292330741882324, + "C": 0.00035322841722518206, + "D": 0.00024277008196804672 }, "sample": { "messages": [ @@ -61657,10 +61657,10 @@ ] }, "predict": { - "A": 0.02515016868710518, - "B": 0.015254348516464233, - "C": 0.9437531232833862, - "D": 0.002650808310136199 + "A": 0.004521307069808245, + "B": 0.003990039229393005, + "C": 0.9763303399085999, + "D": 0.0006933648255653679 }, "sample": { "messages": [ @@ -61702,10 +61702,10 @@ ] }, "predict": { - "A": 0.00025789832579903305, - "B": 0.0002422730904072523, - "C": 0.0003752398770302534, - "D": 0.9871383309364319 + "A": 0.0001531065790913999, + "B": 5.29122153238859e-05, + "C": 0.00016298110131174326, + "D": 0.9662078619003296 }, "sample": { "messages": [ @@ -61740,17 +61740,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "C" ] }, "predict": { - "A": 0.5131979584693909, - "B": 0.166610985994339, - "C": 0.2746950685977936, - "D": 0.006460208911448717 + "A": 0.20061400532722473, + "B": 0.03950324282050133, + "C": 0.7002117037773132, + "D": 0.0003872761153616011 }, "sample": { "messages": [ @@ -61780,7 +61780,7 @@ "prompt_len": 80, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -61792,10 +61792,10 @@ ] }, "predict": { - "A": 0.37653857469558716, - "B": 0.37653857469558716, - "C": 0.021242855116724968, - "D": 0.2015465945005417 + "A": 0.32785362005233765, + "B": 0.2893298268318176, + "C": 0.030495136976242065, + "D": 0.2893298268318176 }, "sample": { "messages": [ @@ -61837,10 +61837,10 @@ ] }, "predict": { - "A": 0.8764761686325073, - "B": 0.08152498304843903, - "C": 0.0014931822661310434, - "D": 0.001317728660069406 + "A": 0.8656182885169983, + "B": 0.013991416431963444, + "C": 0.0006965916254557669, + "D": 0.00039690593257546425 }, "sample": { "messages": [ @@ -61882,10 +61882,10 @@ ] }, "predict": { - "A": 0.0014689717208966613, - "B": 0.9770743250846863, - "C": 0.0016645633149892092, - "D": 0.0005752576398663223 + "A": 0.0018090968951582909, + "B": 0.9371353983879089, + "C": 0.0014089259784668684, + "D": 0.0009683397365733981 }, "sample": { "messages": [ @@ -61927,10 +61927,10 @@ ] }, "predict": { - "A": 0.019254721701145172, - "B": 0.927744448184967, - "C": 0.0169922336935997, - "D": 0.01167857926338911 + "A": 0.01857592724263668, + "B": 0.8950383067131042, + "C": 0.00603072065860033, + "D": 0.004696730058640242 }, "sample": { "messages": [ @@ -61972,10 +61972,10 @@ ] }, "predict": { - "A": 0.00011439722584327683, - "B": 0.00024217893951572478, - "C": 0.00012962904293090105, - "D": 0.9867547154426575 + "A": 6.762570410501212e-05, + "B": 5.606367267319001e-05, + "C": 7.1987189585343e-05, + "D": 0.961729109287262 }, "sample": { "messages": [ @@ -62010,17 +62010,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.003155092243105173, - "B": 0.8748196363449097, - "C": 0.09220530837774277, - "D": 0.007568672765046358 + "A": 0.003194920951500535, + "B": 0.41845208406448364, + "C": 0.5373031497001648, + "D": 0.00766421714797616 }, "sample": { "messages": [ @@ -62050,7 +62050,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -62062,10 +62062,10 @@ ] }, "predict": { - "A": 0.0007051420980133116, - "B": 0.0009054203401319683, - "C": 0.9929139614105225, - "D": 0.00015733846521470696 + "A": 0.000242963113123551, + "B": 0.00013004877837374806, + "C": 0.9899498224258423, + "D": 3.9662631024839357e-05 }, "sample": { "messages": [ @@ -62107,10 +62107,10 @@ ] }, "predict": { - "A": 0.0024152956902980804, - "B": 0.9743998050689697, - "C": 0.0012928146170452237, - "D": 0.003101301146671176 + "A": 0.0033312784507870674, + "B": 0.9236711859703064, + "C": 0.006223647389560938, + "D": 0.0024372152984142303 }, "sample": { "messages": [ @@ -62148,14 +62148,14 @@ "acc": false, "f1_macro": [ "D", - "A" + "B" ] }, "predict": { - "A": 0.5611332058906555, - "B": 0.23391512036323547, - "C": 0.1418766975402832, - "D": 0.019200921058654785 + "A": 0.21216055750846863, + "B": 0.5089467167854309, + "C": 0.16523081064224243, + "D": 0.02533896453678608 }, "sample": { "messages": [ @@ -62185,7 +62185,7 @@ "prompt_len": 94, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " B" } } { @@ -62197,10 +62197,10 @@ ] }, "predict": { - "A": 0.14397409558296204, - "B": 0.011818114668130875, - "C": 0.8285138010978699, - "D": 0.001812366652302444 + "A": 0.052503496408462524, + "B": 0.0012347637675702572, + "C": 0.9306466579437256, + "D": 0.00035376576124690473 }, "sample": { "messages": [ @@ -62242,10 +62242,10 @@ ] }, "predict": { - "A": 0.021198377013206482, - "B": 0.9013779163360596, - "C": 0.024020908400416374, - "D": 0.010013405233621597 + "A": 0.023814871907234192, + "B": 0.8936465382575989, + "C": 0.014444450847804546, + "D": 0.006409685593098402 }, "sample": { "messages": [ @@ -62287,10 +62287,10 @@ ] }, "predict": { - "A": 0.9009584784507751, - "B": 0.03958535194396973, - "C": 0.014562636613845825, - "D": 0.010008744895458221 + "A": 0.9101681113243103, + "B": 0.004214914981275797, + "C": 0.004776123911142349, + "D": 0.004776123911142349 }, "sample": { "messages": [ @@ -62332,10 +62332,10 @@ ] }, "predict": { - "A": 0.0002270133700221777, - "B": 0.0004805873613804579, - "C": 9.463325841352344e-05, - "D": 0.9846179485321045 + "A": 3.561465200618841e-05, + "B": 7.082820229697973e-05, + "C": 4.8679463361622766e-05, + "D": 0.9462453722953796 }, "sample": { "messages": [ @@ -62377,10 +62377,10 @@ ] }, "predict": { - "A": 0.00037427240749821067, - "B": 0.9845932126045227, - "C": 3.9448015741072595e-05, - "D": 0.000291483651380986 + "A": 0.00035628137993626297, + "B": 0.9372645020484924, + "C": 2.747346843534615e-05, + "D": 0.00029536764486692846 }, "sample": { "messages": [ @@ -62422,10 +62422,10 @@ ] }, "predict": { - "A": 0.02727874368429184, - "B": 0.903347909450531, - "C": 0.02407340705394745, - "D": 0.014601259492337704 + "A": 0.24632567167282104, + "B": 0.4061223864555359, + "C": 0.19183863699436188, + "D": 0.07057347893714905 }, "sample": { "messages": [ @@ -62467,10 +62467,10 @@ ] }, "predict": { - "A": 0.007323778700083494, - "B": 0.022558828815817833, - "C": 0.9592257738113403, - "D": 0.00025060592452064157 + "A": 0.004547111690044403, + "B": 0.003125180955976248, + "C": 0.9819025993347168, + "D": 0.00010693781223380938 }, "sample": { "messages": [ @@ -62512,10 +62512,10 @@ ] }, "predict": { - "A": 0.12836267054080963, - "B": 0.0030188006348907948, - "C": 0.0020747892558574677, - "D": 0.8370296955108643 + "A": 0.2810550630092621, + "B": 0.0007894258596934378, + "C": 0.001671214704401791, + "D": 0.674216091632843 }, "sample": { "messages": [ @@ -62557,10 +62557,10 @@ ] }, "predict": { - "A": 0.009734715335071087, - "B": 0.6824563145637512, - "C": 0.007581404875963926, - "D": 0.28449010848999023 + "A": 0.011567936278879642, + "B": 0.9189549684524536, + "C": 0.0054643056355416775, + "D": 0.007468821480870247 }, "sample": { "messages": [ @@ -62602,10 +62602,10 @@ ] }, "predict": { - "A": 0.727364718914032, - "B": 0.03195817768573761, - "C": 0.20839348435401917, - "D": 0.002972573507577181 + "A": 0.9337947368621826, + "B": 0.005552546586841345, + "C": 0.006291859783232212, + "D": 0.0009648879058659077 }, "sample": { "messages": [ @@ -62647,10 +62647,10 @@ ] }, "predict": { - "A": 0.0018689722055569291, - "B": 0.9681515097618103, - "C": 7.246791938086972e-05, - "D": 0.0001633089705137536 + "A": 0.006017894484102726, + "B": 0.8931347131729126, + "C": 8.584065653849393e-05, + "D": 0.000193444881006144 }, "sample": { "messages": [ @@ -62692,10 +62692,10 @@ ] }, "predict": { - "A": 0.008664490655064583, - "B": 0.0006276535568758845, - "C": 0.09315204620361328, - "D": 0.883802056312561 + "A": 0.003261524485424161, + "B": 9.848950867308304e-05, + "C": 0.05101883038878441, + "D": 0.904330313205719 }, "sample": { "messages": [ @@ -62737,10 +62737,10 @@ ] }, "predict": { - "A": 0.530489981174469, - "B": 0.10445965826511383, - "C": 0.3217584192752838, - "D": 0.00971626304090023 + "A": 0.6742410659790039, + "B": 0.009617531672120094, + "C": 0.24803945422172546, + "D": 0.002755469176918268 }, "sample": { "messages": [ @@ -62782,10 +62782,10 @@ ] }, "predict": { - "A": 0.2306973934173584, - "B": 0.021458201110363007, - "C": 0.7105979919433594, - "D": 0.0007342591998167336 + "A": 0.3403545022010803, + "B": 0.006233810447156429, + "C": 0.5611496567726135, + "D": 0.0002915591758210212 }, "sample": { "messages": [ @@ -62827,10 +62827,10 @@ ] }, "predict": { - "A": 0.0008487647864967585, - "B": 0.0004836116568185389, - "C": 0.9908140301704407, - "D": 0.00042678575846366584 + "A": 0.00037594643072225153, + "B": 0.00013830297393724322, + "C": 0.9889971017837524, + "D": 8.388498099520802e-05 }, "sample": { "messages": [ @@ -62872,10 +62872,10 @@ ] }, "predict": { - "A": 0.8885850310325623, - "B": 0.04424004256725311, - "C": 0.004662866238504648, - "D": 0.012674984522163868 + "A": 0.8971861600875854, + "B": 0.004154796712100506, + "C": 0.0011903690174221992, + "D": 0.001731976750306785 }, "sample": { "messages": [ @@ -62917,10 +62917,10 @@ ] }, "predict": { - "A": 0.9335343241691589, - "B": 0.04101663827896118, - "C": 0.0010930562857538462, - "D": 0.002314000390470028 + "A": 0.9193279147148132, + "B": 0.0037570830900222063, + "C": 0.00021196011221036315, + "D": 0.0003959936439059675 }, "sample": { "messages": [ @@ -62962,10 +62962,10 @@ ] }, "predict": { - "A": 0.0075339968316257, - "B": 0.000700770877301693, - "C": 0.9867588877677917, - "D": 3.078972440562211e-05 + "A": 0.006634571123868227, + "B": 0.00010723786544986069, + "C": 0.9846577048301697, + "D": 1.2807735402020626e-05 }, "sample": { "messages": [ @@ -63007,10 +63007,10 @@ ] }, "predict": { - "A": 0.11569848656654358, - "B": 0.010761636309325695, - "C": 0.8549026250839233, - "D": 0.0034937916789203882 + "A": 0.2113378643989563, + "B": 0.0028319291304796934, + "C": 0.7376416325569153, + "D": 0.0008636895217932761 }, "sample": { "messages": [ @@ -63052,10 +63052,10 @@ ] }, "predict": { - "A": 0.04098275303840637, - "B": 0.7264366745948792, - "C": 0.20812758803367615, - "D": 0.006284908391535282 + "A": 0.12112797796726227, + "B": 0.5428578853607178, + "C": 0.25642794370651245, + "D": 0.003657747758552432 }, "sample": { "messages": [ @@ -63097,10 +63097,10 @@ ] }, "predict": { - "A": 0.0928148403763771, - "B": 0.3239556550979614, - "C": 0.5341125726699829, - "D": 0.00811007060110569 + "A": 0.13975851237773895, + "B": 0.03118434175848961, + "C": 0.8042547702789307, + "D": 0.004220341797918081 }, "sample": { "messages": [ @@ -63142,10 +63142,10 @@ ] }, "predict": { - "A": 0.7947526574134827, - "B": 0.057571690529584885, - "C": 0.06523726880550385, - "D": 0.03491899371147156 + "A": 0.712775707244873, + "B": 0.005442109424620867, + "C": 0.10930769890546799, + "D": 0.008972521871328354 }, "sample": { "messages": [ @@ -63187,10 +63187,10 @@ ] }, "predict": { - "A": 0.06925099343061447, - "B": 0.5798314213752747, - "C": 0.03706738352775574, - "D": 0.24170969426631927 + "A": 0.04734480753540993, + "B": 0.4491950571537018, + "C": 0.03253958001732826, + "D": 0.3498334586620331 }, "sample": { "messages": [ @@ -63232,10 +63232,10 @@ ] }, "predict": { - "A": 0.0027628212701529264, - "B": 0.9836323857307434, - "C": 0.001016385038383305, - "D": 0.0004236923123244196 + "A": 0.0022683993447571993, + "B": 0.9151375889778137, + "C": 0.0012141866609454155, + "D": 0.00041961169335991144 }, "sample": { "messages": [ @@ -63277,10 +63277,10 @@ ] }, "predict": { - "A": 0.004576385486871004, - "B": 0.001907721278257668, - "C": 0.9882240295410156, - "D": 0.0006193463341332972 + "A": 0.012283973395824432, + "B": 0.0011425875127315521, + "C": 0.9758368730545044, + "D": 0.0006115830037742853 }, "sample": { "messages": [ @@ -63322,10 +63322,10 @@ ] }, "predict": { - "A": 0.9638332724571228, - "B": 0.004463433753699064, - "C": 0.0003663809329736978, - "D": 0.004463433753699064 + "A": 0.956026554107666, + "B": 0.000987859908491373, + "C": 9.781138214748353e-05, + "D": 0.0006378103280439973 }, "sample": { "messages": [ @@ -63367,10 +63367,10 @@ ] }, "predict": { - "A": 0.7975115180015564, - "B": 0.07418012619018555, - "C": 0.05098320171236992, - "D": 0.039705757051706314 + "A": 0.6673521995544434, + "B": 0.01778433471918106, + "C": 0.1314094364643097, + "D": 0.005773727782070637 }, "sample": { "messages": [ @@ -63412,10 +63412,10 @@ ] }, "predict": { - "A": 0.02633461356163025, - "B": 0.08111631870269775, - "C": 0.00429900735616684, - "D": 0.8720825910568237 + "A": 0.010701846331357956, + "B": 0.005381226073950529, + "C": 0.00105962622910738, + "D": 0.963349461555481 }, "sample": { "messages": [ @@ -63457,10 +63457,10 @@ ] }, "predict": { - "A": 0.00934670865535736, - "B": 0.9533901214599609, - "C": 0.0016242144629359245, - "D": 0.013599378988146782 + "A": 0.006299623753875494, + "B": 0.9349470734596252, + "C": 0.0013204729184508324, + "D": 0.007138408720493317 }, "sample": { "messages": [ @@ -63502,10 +63502,10 @@ ] }, "predict": { - "A": 0.9536580443382263, - "B": 0.0026786294765770435, - "C": 0.0009854126255959272, - "D": 0.00011769075354095548 + "A": 0.8170912265777588, + "B": 0.00024189555551856756, + "C": 0.0001142633700510487, + "D": 1.2819994481105823e-05 }, "sample": { "messages": [ @@ -63547,10 +63547,10 @@ ] }, "predict": { - "A": 0.41978612542152405, - "B": 0.1202707439661026, - "C": 0.13628460466861725, - "D": 0.25461316108703613 + "A": 0.5277895927429199, + "B": 0.010953923687338829, + "C": 0.10392793267965317, + "D": 0.2493101805448532 }, "sample": { "messages": [ @@ -63592,10 +63592,10 @@ ] }, "predict": { - "A": 0.0507267527282238, - "B": 0.023961620405316353, - "C": 0.8991531729698181, - "D": 0.006058446131646633 + "A": 0.008380305022001266, + "B": 0.0012851613573729992, + "C": 0.9686315655708313, + "D": 0.0006462197634391487 }, "sample": { "messages": [ @@ -63637,10 +63637,10 @@ ] }, "predict": { - "A": 0.0014869474107399583, - "B": 0.0016849323874339461, - "C": 0.989030659198761, - "D": 0.0016849323874339461 + "A": 0.00013865064829587936, + "B": 0.00020173548546154052, + "C": 0.9914832711219788, + "D": 0.00010798122821142897 }, "sample": { "messages": [ @@ -63682,10 +63682,10 @@ ] }, "predict": { - "A": 0.10413284599781036, - "B": 0.4118534028530121, - "C": 0.018095575273036957, - "D": 0.22044925391674042 + "A": 0.023909546434879303, + "B": 0.6987393498420715, + "C": 0.00879583042114973, + "D": 0.08345252275466919 }, "sample": { "messages": [ @@ -63727,10 +63727,10 @@ ] }, "predict": { - "A": 0.0035314098931849003, - "B": 0.9791620969772339, - "C": 0.00037220786907710135, - "D": 0.000328472291585058 + "A": 0.01052013412117958, + "B": 0.9469922780990601, + "C": 0.0005575466784648597, + "D": 0.0009192390134558082 }, "sample": { "messages": [ @@ -63772,10 +63772,10 @@ ] }, "predict": { - "A": 0.019340911880135536, - "B": 0.9318973422050476, - "C": 0.009136000648140907, - "D": 0.0029660251457244158 + "A": 0.04709721356630325, + "B": 0.8348180055618286, + "C": 0.017326094210147858, + "D": 0.0059877377934753895 }, "sample": { "messages": [ @@ -63817,10 +63817,10 @@ ] }, "predict": { - "A": 0.0014774929732084274, - "B": 0.9827421307563782, - "C": 0.00032967323204502463, - "D": 0.0007908450206741691 + "A": 0.0005281971534714103, + "B": 0.9550029039382935, + "C": 0.00015133102715481073, + "D": 0.001118193380534649 }, "sample": { "messages": [ @@ -63855,17 +63855,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.03288356959819794, - "B": 0.6604841947555542, - "C": 0.24297857284545898, - "D": 0.008314268663525581 + "A": 0.0476033091545105, + "B": 0.35174354910850525, + "C": 0.45164763927459717, + "D": 0.006052081938832998 }, "sample": { "messages": [ @@ -63895,7 +63895,7 @@ "prompt_len": 91, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -63907,10 +63907,10 @@ ] }, "predict": { - "A": 0.9717217087745667, - "B": 0.008407039567828178, - "C": 0.00032597631798125803, - "D": 0.0008860955131240189 + "A": 0.8651078939437866, + "B": 0.0011478082742542028, + "C": 0.00029021137743256986, + "D": 0.0003288525913376361 }, "sample": { "messages": [ @@ -63952,10 +63952,10 @@ ] }, "predict": { - "A": 0.9445730447769165, - "B": 0.004956664517521858, - "C": 0.0002796358894556761, - "D": 0.0020662453025579453 + "A": 0.8341097235679626, + "B": 0.00033751846058294177, + "C": 8.533802611054853e-05, + "D": 0.0008618835709057748 }, "sample": { "messages": [ @@ -63997,10 +63997,10 @@ ] }, "predict": { - "A": 0.9580171704292297, - "B": 0.0023746872320771217, - "C": 0.0009899168508127332, - "D": 0.000364170060493052 + "A": 0.850501298904419, + "B": 0.00044189891195856035, + "C": 0.0006844264571554959, + "D": 0.0001049601924023591 }, "sample": { "messages": [ @@ -64035,17 +64035,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.015361947007477283, - "B": 0.0018347229342907667, - "C": 0.5087177753448486, - "D": 0.4489418864250183 + "A": 0.024266880005598068, + "B": 0.0013690437190234661, + "C": 0.37959787249565125, + "D": 0.5523116588592529 }, "sample": { "messages": [ @@ -64075,7 +64075,7 @@ "prompt_len": 82, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -64087,10 +64087,10 @@ ] }, "predict": { - "A": 0.006618539337068796, - "B": 0.0018962433096021414, - "C": 0.0010149858426302671, - "D": 0.9822783470153809 + "A": 0.012235944159328938, + "B": 0.0009435346000827849, + "C": 0.001372834900394082, + "D": 0.9720214009284973 }, "sample": { "messages": [ @@ -64132,10 +64132,10 @@ ] }, "predict": { - "A": 0.00407084496691823, - "B": 0.7757657766342163, - "C": 0.0019229311728850007, - "D": 0.19614429771900177 + "A": 0.0016545276157557964, + "B": 0.6270432472229004, + "C": 0.0005717894528061152, + "D": 0.3356320858001709 }, "sample": { "messages": [ @@ -64177,10 +64177,10 @@ ] }, "predict": { - "A": 0.9049163460731506, - "B": 0.03975924849510193, - "C": 0.0060972785577178, - "D": 0.004748565144836903 + "A": 0.7918583750724792, + "B": 0.03479183092713356, + "C": 0.004155291244387627, + "D": 0.007763106841593981 }, "sample": { "messages": [ @@ -64222,10 +64222,10 @@ ] }, "predict": { - "A": 0.00012194897863082588, - "B": 8.381422230741009e-05, - "C": 0.00020105988369323313, - "D": 0.9881628155708313 + "A": 6.355625373544171e-05, + "B": 4.949766298523173e-05, + "C": 0.00017276381549891084, + "D": 0.9621497988700867 }, "sample": { "messages": [ @@ -64267,10 +64267,10 @@ ] }, "predict": { - "A": 0.00010697913239710033, - "B": 0.9822820425033569, - "C": 3.935541826649569e-05, - "D": 0.0003295188944321126 + "A": 0.0001209816982736811, + "B": 0.9209299087524414, + "C": 5.368530037230812e-05, + "D": 0.00014593177183996886 }, "sample": { "messages": [ @@ -64312,10 +64312,10 @@ ] }, "predict": { - "A": 0.96671462059021, - "B": 0.009477336890995502, - "C": 0.0012826180318370461, - "D": 0.005072852596640587 + "A": 0.9362465143203735, + "B": 0.0009674213360995054, + "C": 0.001595008303411305, + "D": 0.0009674213360995054 }, "sample": { "messages": [ @@ -64350,17 +64350,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.41390833258628845, - "B": 0.0066902050748467445, - "C": 0.002788892248645425, - "D": 0.5314688086509705 + "A": 0.4955625832080841, + "B": 0.0013919334160163999, + "C": 0.0013919334160163999, + "D": 0.38594454526901245 }, "sample": { "messages": [ @@ -64390,7 +64390,7 @@ "prompt_len": 79, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -64402,10 +64402,10 @@ ] }, "predict": { - "A": 0.935808002948761, - "B": 0.0007530755829066038, - "C": 0.0008533464279025793, - "D": 0.00045676346053369343 + "A": 0.865760326385498, + "B": 0.0001003700090222992, + "C": 0.0006148408283479512, + "D": 0.00018751583411358297 }, "sample": { "messages": [ @@ -64447,10 +64447,10 @@ ] }, "predict": { - "A": 0.5827919840812683, - "B": 0.40054669976234436, - "C": 0.0021018763072788715, - "D": 0.0007732370286248624 + "A": 0.5329761505126953, + "B": 0.41508227586746216, + "C": 0.0013211158802732825, + "D": 0.0002156662376364693 }, "sample": { "messages": [ @@ -64492,10 +64492,10 @@ ] }, "predict": { - "A": 0.25158029794692993, - "B": 0.683866024017334, - "C": 0.02651636302471161, - "D": 0.020650966092944145 + "A": 0.38232293725013733, + "B": 0.5562765598297119, + "C": 0.013082372024655342, + "D": 0.01018856093287468 }, "sample": { "messages": [ @@ -64530,17 +64530,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.19244149327278137, - "B": 0.5231101512908936, - "C": 0.015796558931469917, - "D": 0.24709974229335785 + "A": 0.3225831687450409, + "B": 0.2846786379814148, + "C": 0.02336784638464451, + "D": 0.3225831687450409 }, "sample": { "messages": [ @@ -64570,7 +64570,7 @@ "prompt_len": 89, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -64582,10 +64582,10 @@ ] }, "predict": { - "A": 0.0020034825429320335, - "B": 0.9158815145492554, - "C": 0.0017680672463029623, - "D": 0.06634623557329178 + "A": 0.0009630820131860673, + "B": 0.9320470690727234, + "C": 0.0005841387319378555, + "D": 0.03189289569854736 }, "sample": { "messages": [ @@ -64627,10 +64627,10 @@ ] }, "predict": { - "A": 0.0005437110085040331, - "B": 0.9830526113510132, - "C": 0.00020002010569442064, - "D": 0.0004798233276233077 + "A": 0.00015856498794164509, + "B": 0.9400275945663452, + "C": 0.00012349053577054292, + "D": 0.000589140341617167 }, "sample": { "messages": [ @@ -64672,10 +64672,10 @@ ] }, "predict": { - "A": 0.008501802571117878, - "B": 0.0013037936296314, - "C": 0.982674777507782, - "D": 0.00012127169611630961 + "A": 0.00846819393336773, + "B": 0.00022566970437765121, + "C": 0.9787901043891907, + "D": 4.44370016339235e-05 }, "sample": { "messages": [ @@ -64710,17 +64710,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.3290482759475708, - "B": 0.08319642394781113, - "C": 0.07342059910297394, - "D": 0.4787624180316925 + "A": 0.3942715525627136, + "B": 0.022243279963731766, + "C": 0.08797387778759003, + "D": 0.3942715525627136 }, "sample": { "messages": [ @@ -64750,7 +64750,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -64762,10 +64762,10 @@ ] }, "predict": { - "A": 0.07285131514072418, - "B": 0.8875106573104858, - "C": 0.0076784719713032246, - "D": 0.003627053229138255 + "A": 0.3749542236328125, + "B": 0.5455551743507385, + "C": 0.011322636157274246, + "D": 0.0022295592352747917 }, "sample": { "messages": [ @@ -64807,10 +64807,10 @@ ] }, "predict": { - "A": 0.7915573120117188, - "B": 0.03940931707620621, - "C": 0.0036656379234045744, - "D": 0.13755202293395996 + "A": 0.7269269824028015, + "B": 0.010369055904448032, + "C": 0.0026217082049697638, + "D": 0.20826807618141174 }, "sample": { "messages": [ @@ -64852,10 +64852,10 @@ ] }, "predict": { - "A": 0.020049573853611946, - "B": 0.0057442993856966496, - "C": 0.9660426378250122, - "D": 0.00015308038564398885 + "A": 0.0031479797326028347, + "B": 0.00042603269685059786, + "C": 0.9890657663345337, + "D": 4.779954542755149e-05 }, "sample": { "messages": [ @@ -64897,10 +64897,10 @@ ] }, "predict": { - "A": 0.010486613027751446, - "B": 0.010486613027751446, - "C": 0.017289500683546066, - "D": 0.9439747333526611 + "A": 0.007279675453901291, + "B": 0.0025157886557281017, + "C": 0.008780964650213718, + "D": 0.9534494280815125 }, "sample": { "messages": [ @@ -64942,10 +64942,10 @@ ] }, "predict": { - "A": 0.9646364450454712, - "B": 0.002709465567022562, - "C": 0.0014502722769975662, - "D": 0.0007762748282402754 + "A": 0.9307925701141357, + "B": 0.00035382123314775527, + "C": 0.0008487729937769473, + "D": 0.0002933281066361815 }, "sample": { "messages": [ @@ -64987,10 +64987,10 @@ ] }, "predict": { - "A": 0.0011589920613914728, - "B": 0.0002930391055997461, - "C": 0.00015685251855757087, - "D": 0.9898473620414734 + "A": 0.00032650912180542946, + "B": 7.755270780762658e-05, + "C": 5.673877603840083e-05, + "D": 0.9733099341392517 }, "sample": { "messages": [ @@ -65032,10 +65032,10 @@ ] }, "predict": { - "A": 0.7358777523040771, - "B": 0.025180350989103317, - "C": 0.18605902791023254, - "D": 0.007214291952550411 + "A": 0.9061983823776245, + "B": 0.0012023262679576874, + "C": 0.006918909959495068, + "D": 0.00039033821667544544 }, "sample": { "messages": [ @@ -65077,10 +65077,10 @@ ] }, "predict": { - "A": 0.004426009953022003, - "B": 0.8434482216835022, - "C": 0.12934698164463043, - "D": 0.007297256961464882 + "A": 0.002057876205071807, + "B": 0.7799068689346313, + "C": 0.13552747666835785, + "D": 0.004092575516551733 }, "sample": { "messages": [ @@ -65122,10 +65122,10 @@ ] }, "predict": { - "A": 0.002529853256419301, - "B": 0.2580520808696747, - "C": 0.701458215713501, - "D": 0.0009306810097768903 + "A": 0.0008764471276663244, + "B": 0.05093860998749733, + "C": 0.902908444404602, + "D": 0.00041400431655347347 }, "sample": { "messages": [ @@ -65167,10 +65167,10 @@ ] }, "predict": { - "A": 0.5388562083244324, - "B": 0.0111836027354002, - "C": 0.0036307843402028084, - "D": 0.41966161131858826 + "A": 0.7249575853347778, + "B": 0.00191288604401052, + "C": 0.0013994973851367831, + "D": 0.2077038288116455 }, "sample": { "messages": [ @@ -65212,10 +65212,10 @@ ] }, "predict": { - "A": 0.04524548724293709, - "B": 0.012963050045073032, - "C": 0.003953506704419851, - "D": 0.9087799787521362 + "A": 0.06429940462112427, + "B": 0.005618419032543898, + "C": 0.008701978251338005, + "D": 0.8876258730888367 }, "sample": { "messages": [ @@ -65257,10 +65257,10 @@ ] }, "predict": { - "A": 0.9816478490829468, - "B": 0.0018950261874124408, - "C": 6.484422920038924e-05, - "D": 0.00015555322170257568 + "A": 0.9389489889144897, + "B": 0.0005884643760509789, + "C": 6.60238802083768e-05, + "D": 0.00019104641978628933 }, "sample": { "messages": [ @@ -65302,10 +65302,10 @@ ] }, "predict": { - "A": 0.6114935874938965, - "B": 0.04429645091295242, - "C": 0.002831777324900031, - "D": 0.3273089528083801 + "A": 0.6463974118232727, + "B": 0.07720115780830383, + "C": 0.003391982289031148, + "D": 0.20985451340675354 }, "sample": { "messages": [ @@ -65347,10 +65347,10 @@ ] }, "predict": { - "A": 0.0005502150161191821, - "B": 0.0002945089072454721, - "C": 0.9948121309280396, - "D": 0.00022936375171411783 + "A": 0.00017862730601336807, + "B": 3.517380537232384e-05, + "C": 0.9948045611381531, + "D": 4.807689037988894e-05 }, "sample": { "messages": [ @@ -65392,10 +65392,10 @@ ] }, "predict": { - "A": 0.1636604219675064, - "B": 0.010462460108101368, - "C": 0.4448751211166382, - "D": 0.34646910429000854 + "A": 0.098415806889534, + "B": 0.005552239716053009, + "C": 0.727199912071228, + "D": 0.1115197241306305 }, "sample": { "messages": [ @@ -65437,10 +65437,10 @@ ] }, "predict": { - "A": 0.011758649721741676, - "B": 0.021968049928545952, - "C": 0.002973052440211177, - "D": 0.9341052770614624 + "A": 0.0012700756778940558, + "B": 0.0023728138767182827, + "C": 0.0004672347567975521, + "D": 0.9572613835334778 }, "sample": { "messages": [ @@ -65482,10 +65482,10 @@ ] }, "predict": { - "A": 0.04173412173986435, - "B": 0.7397549152374268, - "C": 0.047290951013565063, - "D": 0.0688079223036766 + "A": 0.11640771478414536, + "B": 0.5911672115325928, + "C": 0.11640771478414536, + "D": 0.02943248115479946 }, "sample": { "messages": [ @@ -65527,10 +65527,10 @@ ] }, "predict": { - "A": 0.010525335557758808, - "B": 0.025248965248465538, - "C": 0.0005240255268290639, - "D": 0.947460412979126 + "A": 0.0063934470526874065, + "B": 0.0063934470526874065, + "C": 0.00019306536705698818, + "D": 0.9488716721534729 }, "sample": { "messages": [ @@ -65572,10 +65572,10 @@ ] }, "predict": { - "A": 0.025328481569886208, - "B": 0.0044014304876327515, - "C": 0.004987474065274, - "D": 0.9504442811012268 + "A": 0.009422563947737217, + "B": 0.00112536468077451, + "C": 0.004450904205441475, + "D": 0.9611275792121887 }, "sample": { "messages": [ @@ -65617,10 +65617,10 @@ ] }, "predict": { - "A": 0.0023951511830091476, - "B": 0.002714061876758933, - "C": 0.0030754348263144493, - "D": 0.9662728905677795 + "A": 0.0009408257901668549, + "B": 0.0014571795472875237, + "C": 0.0007327158818952739, + "D": 0.9105080366134644 }, "sample": { "messages": [ @@ -65662,10 +65662,10 @@ ] }, "predict": { - "A": 0.514141857624054, - "B": 0.21432620286941528, - "C": 0.054190151393413544, - "D": 0.18914224207401276 + "A": 0.5076495409011841, + "B": 0.041670411825180054, + "C": 0.019683707505464554, + "D": 0.3953578472137451 }, "sample": { "messages": [ @@ -65707,10 +65707,10 @@ ] }, "predict": { - "A": 0.0014809267595410347, - "B": 0.0019015476573258638, - "C": 0.9850260615348816, - "D": 0.002441635588183999 + "A": 0.00048394856275990605, + "B": 0.0004270830540917814, + "C": 0.9915042519569397, + "D": 0.0007978962385095656 }, "sample": { "messages": [ @@ -65745,17 +65745,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.45953619480133057, - "B": 0.5207227468490601, - "C": 0.0011390764266252518, - "D": 0.001657346379943192 + "A": 0.7787063121795654, + "B": 0.17375285923480988, + "C": 0.001033172244206071, + "D": 0.0026382978539913893 }, "sample": { "messages": [ @@ -65785,7 +65785,7 @@ "prompt_len": 62, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -65797,10 +65797,10 @@ ] }, "predict": { - "A": 0.006409304682165384, - "B": 0.02237067185342312, - "C": 0.004991571418941021, - "D": 0.9512251019477844 + "A": 0.0022095877211540937, + "B": 0.003422276582568884, + "C": 0.0025037911254912615, + "D": 0.948902428150177 }, "sample": { "messages": [ @@ -65842,10 +65842,10 @@ ] }, "predict": { - "A": 0.0016605273121967912, - "B": 0.0007843775092624128, - "C": 0.00022472791897598654, - "D": 0.9747052788734436 + "A": 0.0010431640548631549, + "B": 0.00020541118283290416, + "C": 0.0001599743845872581, + "D": 0.8909234404563904 }, "sample": { "messages": [ @@ -65887,10 +65887,10 @@ ] }, "predict": { - "A": 0.12950871884822845, - "B": 0.7452712655067444, - "C": 0.06117559224367142, - "D": 0.028897302225232124 + "A": 0.04295019805431366, + "B": 0.8626777529716492, + "C": 0.013943887315690517, + "D": 0.020288236439228058 }, "sample": { "messages": [ @@ -65932,10 +65932,10 @@ ] }, "predict": { - "A": 0.08962612599134445, - "B": 0.022661034017801285, - "C": 0.012129576876759529, - "D": 0.8503490090370178 + "A": 0.07281319051980972, + "B": 0.0056147500872612, + "C": 0.009854193776845932, + "D": 0.887046217918396 }, "sample": { "messages": [ @@ -65977,10 +65977,10 @@ ] }, "predict": { - "A": 0.012021436356008053, - "B": 0.0014357557520270348, - "C": 0.0007219433318823576, - "D": 0.9549809694290161 + "A": 0.0213206447660923, + "B": 0.0002683873171918094, + "C": 0.0002683873171918094, + "D": 0.9065768122673035 }, "sample": { "messages": [ @@ -66022,10 +66022,10 @@ ] }, "predict": { - "A": 0.0007902131183072925, - "B": 0.000256544619332999, - "C": 7.350127270910889e-05, - "D": 0.9819568395614624 + "A": 0.0002513870131224394, + "B": 5.609202344203368e-05, + "C": 2.3382635845337063e-05, + "D": 0.9039177298545837 }, "sample": { "messages": [ @@ -66067,10 +66067,10 @@ ] }, "predict": { - "A": 0.07255911082029343, - "B": 0.15360763669013977, - "C": 0.6884216666221619, - "D": 0.03883809596300125 + "A": 0.21337173879146576, + "B": 0.16617408394813538, + "C": 0.45170795917510986, + "D": 0.04201538488268852 }, "sample": { "messages": [ @@ -66112,10 +66112,10 @@ ] }, "predict": { - "A": 0.00631377100944519, - "B": 0.015145949088037014, - "C": 0.03206397593021393, - "D": 0.9370467066764832 + "A": 0.0012933190446346998, + "B": 0.0024162381887435913, + "C": 0.0027379565872251987, + "D": 0.9747800230979919 }, "sample": { "messages": [ @@ -66157,10 +66157,10 @@ ] }, "predict": { - "A": 0.9125843644142151, - "B": 0.0042261043563485146, - "C": 0.045434899628162384, - "D": 0.0010685264132916927 + "A": 0.770438015460968, + "B": 0.0007025485392659903, + "C": 0.05581034719944, + "D": 0.0001668698969297111 }, "sample": { "messages": [ @@ -66202,10 +66202,10 @@ ] }, "predict": { - "A": 0.0085074407979846, - "B": 0.9833264946937561, - "C": 0.00012135213182773441, - "D": 0.0016752146184444427 + "A": 0.013540093787014484, + "B": 0.9492339491844177, + "C": 0.00011004733823938295, + "D": 0.0013406509533524513 }, "sample": { "messages": [ @@ -66247,10 +66247,10 @@ ] }, "predict": { - "A": 0.927487850189209, - "B": 0.01924939453601837, - "C": 0.008024342358112335, - "D": 0.0009583709761500359 + "A": 0.8971880674362183, + "B": 0.0023673365358263254, + "C": 0.004154805559664965, + "D": 0.000496220716740936 }, "sample": { "messages": [ @@ -66292,10 +66292,10 @@ ] }, "predict": { - "A": 0.0035201613791286945, - "B": 0.0014674215344712138, - "C": 0.00274150469340384, - "D": 0.976043164730072 + "A": 0.0010972128948196769, + "B": 0.00016826303908601403, + "C": 0.0005872956826351583, + "D": 0.9370843172073364 }, "sample": { "messages": [ @@ -66337,10 +66337,10 @@ ] }, "predict": { - "A": 0.02009926177561283, - "B": 0.0034927281085401773, - "C": 0.9684366583824158, - "D": 0.0011339227203279734 + "A": 0.013932797126471996, + "B": 0.00039524302701465786, + "C": 0.9767645001411438, + "D": 0.00022520268976222724 }, "sample": { "messages": [ @@ -66382,10 +66382,10 @@ ] }, "predict": { - "A": 0.8218938708305359, - "B": 0.059537794440984726, - "C": 0.024819044396281242, - "D": 0.059537794440984726 + "A": 0.8990674614906311, + "B": 0.000872718752361834, + "C": 0.000872718752361834, + "D": 0.00367428339086473 }, "sample": { "messages": [ @@ -66427,10 +66427,10 @@ ] }, "predict": { - "A": 0.9133128523826599, - "B": 0.011496912688016891, - "C": 0.0037325010634958744, - "D": 0.010145990177989006 + "A": 0.8834092020988464, + "B": 0.002481314819306135, + "C": 0.001413810532540083, + "D": 0.005952364299446344 }, "sample": { "messages": [ @@ -66472,10 +66472,10 @@ ] }, "predict": { - "A": 0.005755469668656588, - "B": 0.009489165619015694, - "C": 0.0011333191068843007, - "D": 0.9679211974143982 + "A": 0.0018829815089702606, + "B": 0.0012941529275849462, + "C": 0.0003272131143603474, + "D": 0.9754085540771484 }, "sample": { "messages": [ @@ -66517,10 +66517,10 @@ ] }, "predict": { - "A": 0.009648027829825878, - "B": 0.0007919583003968, - "C": 0.9841254949569702, - "D": 0.00010717990517150611 + "A": 0.0040437993593513966, + "B": 0.00015679511125199497, + "C": 0.9894850850105286, + "D": 3.9644008211325854e-05 }, "sample": { "messages": [ @@ -66562,10 +66562,10 @@ ] }, "predict": { - "A": 0.013320370577275753, - "B": 0.02488572895526886, - "C": 0.015093958005309105, - "D": 0.9338300824165344 + "A": 0.006473633926361799, + "B": 0.00346508645452559, + "C": 0.005712961778044701, + "D": 0.9607724547386169 }, "sample": { "messages": [ @@ -66607,10 +66607,10 @@ ] }, "predict": { - "A": 0.1300588846206665, - "B": 0.0018551902612671256, - "C": 0.0039274380542337894, - "D": 0.848090410232544 + "A": 0.2588857114315033, + "B": 0.00032267378992401063, + "C": 0.0016386726638302207, + "D": 0.7037242650985718 }, "sample": { "messages": [ @@ -66652,10 +66652,10 @@ ] }, "predict": { - "A": 0.025623716413974762, - "B": 0.0003655026957858354, - "C": 0.0006026119808666408, - "D": 0.9615228176116943 + "A": 0.01336299441754818, + "B": 0.00012306893768254668, + "C": 0.0008025101851671934, + "D": 0.9368182420730591 }, "sample": { "messages": [ @@ -66697,10 +66697,10 @@ ] }, "predict": { - "A": 0.00042885506991297007, - "B": 0.00022954956511966884, - "C": 0.9956181049346924, - "D": 0.0010287697659805417 + "A": 0.00027677667094394565, + "B": 4.518255082075484e-05, + "C": 0.9952118396759033, + "D": 0.00012281890667509288 }, "sample": { "messages": [ @@ -66742,10 +66742,10 @@ ] }, "predict": { - "A": 0.9615845680236816, - "B": 0.0034680154640227556, - "C": 0.00041419538320042193, - "D": 0.00046934487181715667 + "A": 0.8923215866088867, + "B": 0.00036107361665926874, + "C": 0.0002993405214510858, + "D": 0.00015051788068376482 }, "sample": { "messages": [ @@ -66787,10 +66787,10 @@ ] }, "predict": { - "A": 0.012245429679751396, - "B": 0.005104654468595982, - "C": 0.9727749228477478, - "D": 0.000833311933092773 + "A": 0.046734850853681564, + "B": 0.0010325073963031173, + "C": 0.9386946558952332, + "D": 0.0002452420012559742 }, "sample": { "messages": [ @@ -66832,10 +66832,10 @@ ] }, "predict": { - "A": 0.846567690372467, - "B": 0.006463623605668545, - "C": 0.0892275795340538, - "D": 0.007324245758354664 + "A": 0.7190998792648315, + "B": 0.0010811229003593326, + "C": 0.06688670814037323, + "D": 0.0013040831545367837 }, "sample": { "messages": [ @@ -66870,17 +66870,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.007880505174398422, - "B": 0.6260260939598083, - "C": 0.3350876271724701, - "D": 0.002558425534516573 + "A": 0.007373083382844925, + "B": 0.1901543140411377, + "C": 0.7520748972892761, + "D": 0.0008272369159385562 }, "sample": { "messages": [ @@ -66910,7 +66910,7 @@ "prompt_len": 80, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -66922,10 +66922,10 @@ ] }, "predict": { - "A": 0.0010093136224895716, - "B": 0.9767888784408569, - "C": 0.0007860542391426861, - "D": 0.00013659575779456645 + "A": 0.0011886446736752987, + "B": 0.8958864808082581, + "C": 0.0006362356361933053, + "D": 9.165843221126124e-05 }, "sample": { "messages": [ @@ -66967,10 +66967,10 @@ ] }, "predict": { - "A": 0.00403174851089716, - "B": 0.0013089170679450035, - "C": 0.9865363240242004, - "D": 0.00012174824951216578 + "A": 0.0035259337164461613, + "B": 0.000289426272502169, + "C": 0.9776436686515808, + "D": 5.029474414186552e-05 }, "sample": { "messages": [ @@ -67012,10 +67012,10 @@ ] }, "predict": { - "A": 0.38735464215278625, - "B": 0.015019369311630726, - "C": 0.5635976791381836, - "D": 0.0029574891086667776 + "A": 0.07375819981098175, + "B": 0.0025238669477403164, + "C": 0.8985587954521179, + "D": 0.0002498966350685805 }, "sample": { "messages": [ @@ -67057,10 +67057,10 @@ ] }, "predict": { - "A": 0.0953725129365921, - "B": 0.6219068765640259, - "C": 0.22878678143024445, - "D": 0.0030657444149255753 + "A": 0.03384856879711151, + "B": 0.7703898549079895, + "C": 0.11814311891794205, + "D": 0.00666517810896039 }, "sample": { "messages": [ @@ -67102,10 +67102,10 @@ ] }, "predict": { - "A": 0.954276978969574, - "B": 0.004419179633259773, - "C": 0.0008701879414729774, - "D": 0.00041104768752120435 + "A": 0.8789150714874268, + "B": 0.0011661272728815675, + "C": 0.00029484316473826766, + "D": 0.00011546224413905293 }, "sample": { "messages": [ @@ -67147,10 +67147,10 @@ ] }, "predict": { - "A": 0.009450201876461506, - "B": 0.017655301839113235, - "C": 0.9639467597007751, - "D": 0.0004152128822170198 + "A": 0.0031405535992234945, + "B": 0.002158468822017312, + "C": 0.986732542514801, + "D": 6.518010195577517e-05 }, "sample": { "messages": [ @@ -67192,10 +67192,10 @@ ] }, "predict": { - "A": 0.005799249280244112, - "B": 0.11648103594779968, - "C": 0.0021334246266633272, - "D": 0.8606849312782288 + "A": 0.004959840327501297, + "B": 0.02518814615905285, + "C": 0.0028260317631065845, + "D": 0.945178210735321 }, "sample": { "messages": [ @@ -67237,10 +67237,10 @@ ] }, "predict": { - "A": 0.06429915130138397, - "B": 0.8876223564147949, - "C": 0.003407730022445321, - "D": 0.026803871616721153 + "A": 0.281185120344162, + "B": 0.5952688455581665, + "C": 0.001894610351882875, + "D": 0.10344220697879791 }, "sample": { "messages": [ @@ -67282,10 +67282,10 @@ ] }, "predict": { - "A": 0.5657117366790771, - "B": 0.01708301343023777, - "C": 0.38880759477615356, - "D": 0.004319261759519577 + "A": 0.5961216688156128, + "B": 0.0016743831802159548, + "C": 0.3190809488296509, + "D": 0.0005106579628773034 }, "sample": { "messages": [ @@ -67327,10 +67327,10 @@ ] }, "predict": { - "A": 0.9252609014511108, - "B": 0.009070939384400845, - "C": 0.014955449849367142, - "D": 0.0010833691339939833 + "A": 0.8584477305412292, + "B": 0.0010699649574235082, + "C": 0.007427028380334377, + "D": 0.00022427680960390717 }, "sample": { "messages": [ @@ -67372,10 +67372,10 @@ ] }, "predict": { - "A": 0.1874060183763504, - "B": 0.02873964235186577, - "C": 0.7412051558494568, - "D": 0.011980465613305569 + "A": 0.12481009215116501, + "B": 0.0029352521523833275, + "C": 0.8138640522956848, + "D": 0.00130251026712358 }, "sample": { "messages": [ @@ -67417,10 +67417,10 @@ ] }, "predict": { - "A": 0.8729092478752136, - "B": 0.029869310557842255, - "C": 0.04345959052443504, - "D": 0.018116652965545654 + "A": 0.8989405035972595, + "B": 0.014530019834637642, + "C": 0.012822696939110756, + "D": 0.006057013291865587 }, "sample": { "messages": [ @@ -67462,10 +67462,10 @@ ] }, "predict": { - "A": 0.004884724970906973, - "B": 0.03609350696206093, - "C": 0.00015701923985034227, - "D": 0.9308638572692871 + "A": 0.0020328855607658625, + "B": 0.00804021768271923, + "C": 3.9634953282075e-05, + "D": 0.9293228387832642 }, "sample": { "messages": [ @@ -67500,17 +67500,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "C" ] }, "predict": { - "A": 0.7013245224952698, - "B": 0.034916892647743225, - "C": 0.22768673300743103, - "D": 0.0011947904713451862 + "A": 0.3448874354362488, + "B": 0.0033811572939157486, + "C": 0.5686232447624207, + "D": 0.00021615016157738864 }, "sample": { "messages": [ @@ -67540,7 +67540,7 @@ "prompt_len": 106, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -67552,10 +67552,10 @@ ] }, "predict": { - "A": 0.8334912657737732, - "B": 0.0775267630815506, - "C": 0.0008612445672042668, - "D": 0.06037790700793266 + "A": 0.9409681558609009, + "B": 0.0018164959037676454, + "C": 0.00011612469825195149, + "D": 0.003393661230802536 }, "sample": { "messages": [ @@ -67597,10 +67597,10 @@ ] }, "predict": { - "A": 0.0011552257928997278, - "B": 0.0013090423308312893, - "C": 0.9866307377815247, - "D": 0.00022747744515072554 + "A": 0.0007433737628161907, + "B": 0.00021298014326021075, + "C": 0.9833289384841919, + "D": 5.7322828070027754e-05 }, "sample": { "messages": [ @@ -67642,10 +67642,10 @@ ] }, "predict": { - "A": 0.0007841205806471407, - "B": 0.0035141846165060997, - "C": 0.9743859767913818, - "D": 0.009552544914186 + "A": 0.00022705654555466026, + "B": 0.0010175968054682016, + "C": 0.984805166721344, + "D": 0.0010175968054682016 }, "sample": { "messages": [ @@ -67687,10 +67687,10 @@ ] }, "predict": { - "A": 0.0014708420494571328, - "B": 0.9783183336257935, - "C": 0.0007872850401327014, - "D": 0.0012980136089026928 + "A": 0.0007124484982341528, + "B": 0.8853228092193604, + "C": 0.0005906405276618898, + "D": 0.0009738015942275524 }, "sample": { "messages": [ @@ -67732,10 +67732,10 @@ ] }, "predict": { - "A": 0.001891334424726665, - "B": 0.9797354936599731, - "C": 0.0001067017947207205, - "D": 0.0001067017947207205 + "A": 0.0004653209471143782, + "B": 0.895580530166626, + "C": 0.0001176515725092031, + "D": 0.00010382714390289038 }, "sample": { "messages": [ @@ -67777,10 +67777,10 @@ ] }, "predict": { - "A": 0.09331591427326202, - "B": 0.17433708906173706, - "C": 0.689516544342041, - "D": 0.006759782787412405 + "A": 0.07081654667854309, + "B": 0.09093024581670761, + "C": 0.761349618434906, + "D": 0.003111461875960231 }, "sample": { "messages": [ @@ -67822,10 +67822,10 @@ ] }, "predict": { - "A": 0.8458367586135864, - "B": 0.010647513903677464, - "C": 0.06943050771951675, - "D": 0.047718845307826996 + "A": 0.8782873749732971, + "B": 0.001094692968763411, + "C": 0.03858925402164459, + "D": 0.008610427379608154 }, "sample": { "messages": [ @@ -67867,10 +67867,10 @@ ] }, "predict": { - "A": 0.009451749734580517, - "B": 0.002707971725612879, - "C": 0.0007758468273095787, - "D": 0.9641046524047852 + "A": 0.004766744561493397, + "B": 0.0005348139093257487, + "C": 0.00019674703071359545, + "D": 0.9083807468414307 }, "sample": { "messages": [ @@ -67912,10 +67912,10 @@ ] }, "predict": { - "A": 0.9643018245697021, - "B": 0.006497414316982031, - "C": 0.0009964107302948833, - "D": 0.0011290812399238348 + "A": 0.9222765564918518, + "B": 0.001014448469504714, + "C": 0.00035058404318988323, + "D": 0.00019975674513261765 }, "sample": { "messages": [ @@ -67957,10 +67957,10 @@ ] }, "predict": { - "A": 0.004574007820338011, - "B": 0.00037545739905908704, - "C": 0.9877105355262756, - "D": 0.0007948433049023151 + "A": 0.004521289374679327, + "B": 8.815110777504742e-05, + "C": 0.976326584815979, + "D": 9.383636643178761e-05 }, "sample": { "messages": [ @@ -68002,10 +68002,10 @@ ] }, "predict": { - "A": 0.0018970360979437828, - "B": 0.008501926437020302, - "C": 0.9826890230178833, - "D": 0.00022656864894088358 + "A": 0.00013751070946455002, + "B": 0.00025690378970466554, + "C": 0.9833316802978516, + "D": 1.9810295270872302e-05 }, "sample": { "messages": [ @@ -68047,10 +68047,10 @@ ] }, "predict": { - "A": 0.005147372838109732, - "B": 0.00454254075884819, - "C": 0.0005096589447930455, - "D": 0.9809156060218811 + "A": 0.00452156038954854, + "B": 0.005805797874927521, + "C": 0.00021147617371752858, + "D": 0.9763851165771484 }, "sample": { "messages": [ @@ -68092,10 +68092,10 @@ ] }, "predict": { - "A": 0.07230410724878311, - "B": 0.02659919299185276, - "C": 0.8808442950248718, - "D": 0.0011686870129778981 + "A": 0.008364210836589336, + "B": 0.005748632363975048, + "C": 0.9667713046073914, + "D": 0.000286207563476637 }, "sample": { "messages": [ @@ -68137,10 +68137,10 @@ ] }, "predict": { - "A": 0.03170573711395264, - "B": 0.9265775084495544, - "C": 0.019230501726269722, - "D": 0.000658031611237675 + "A": 0.02402728796005249, + "B": 0.9016173481941223, + "C": 0.007800518535077572, + "D": 0.00036483508301898837 }, "sample": { "messages": [ @@ -68182,10 +68182,10 @@ ] }, "predict": { - "A": 0.003474998055025935, - "B": 0.003474998055025935, - "C": 0.007356570102274418, - "D": 0.9635205864906311 + "A": 0.0020878356881439686, + "B": 0.0009862236911430955, + "C": 0.012014664709568024, + "D": 0.9544429779052734 }, "sample": { "messages": [ @@ -68227,10 +68227,10 @@ ] }, "predict": { - "A": 0.5874062180519104, - "B": 0.31441590189933777, - "C": 0.005758728366345167, - "D": 0.061912160366773605 + "A": 0.7837426662445068, + "B": 0.08260586857795715, + "C": 0.004660304170101881, + "D": 0.05010299012064934 }, "sample": { "messages": [ @@ -68265,17 +68265,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.005086475517600775, - "B": 0.8554136753082275, - "C": 0.10216459631919861, - "D": 0.0018712098244577646 + "A": 0.0030162730254232883, + "B": 0.032427988946437836, + "C": 0.9476847648620605, + "D": 0.0005939393304288387 }, "sample": { "messages": [ @@ -68305,7 +68305,7 @@ "prompt_len": 66, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -68317,10 +68317,10 @@ ] }, "predict": { - "A": 0.006506274454295635, - "B": 0.005741766653954983, - "C": 0.0050670914351940155, - "D": 0.9656167030334473 + "A": 0.0044450536370277405, + "B": 0.002379265846684575, + "C": 0.006467514671385288, + "D": 0.9598642587661743 }, "sample": { "messages": [ @@ -68362,10 +68362,10 @@ ] }, "predict": { - "A": 0.969018280506134, - "B": 0.005761993117630482, - "C": 0.00032506941352039576, - "D": 0.0006073095719330013 + "A": 0.9387235045433044, + "B": 0.0007554218173027039, + "C": 0.00011584768071770668, + "D": 0.00011584768071770668 }, "sample": { "messages": [ @@ -68407,10 +68407,10 @@ ] }, "predict": { - "A": 0.0016393903642892838, - "B": 0.9622981548309326, - "C": 0.0006833994993939996, - "D": 0.0003228150599170476 + "A": 0.002253561047837138, + "B": 0.8023232221603394, + "C": 0.0008290386758744717, + "D": 0.00022313273802865297 }, "sample": { "messages": [ @@ -68452,10 +68452,10 @@ ] }, "predict": { - "A": 0.032000474631786346, - "B": 0.12656433880329132, - "C": 0.8253031373023987, - "D": 0.001406002906151116 + "A": 0.028330156579613686, + "B": 0.013382217846810818, + "C": 0.9381659626960754, + "D": 0.0006662614177912474 }, "sample": { "messages": [ @@ -68497,10 +68497,10 @@ ] }, "predict": { - "A": 0.9159330725669861, - "B": 0.005446337163448334, - "C": 0.035514578223228455, - "D": 0.0008352228906005621 + "A": 0.8907756209373474, + "B": 0.00040844117756932974, + "C": 0.01439804770052433, + "D": 0.00012456752301659435 }, "sample": { "messages": [ @@ -68542,10 +68542,10 @@ ] }, "predict": { - "A": 0.0065573775209486485, - "B": 0.008419839665293694, - "C": 0.9732010960578918, - "D": 0.0011395012261345983 + "A": 0.00843451265245676, + "B": 0.002416528295725584, + "C": 0.9748970866203308, + "D": 0.0006503997719846666 }, "sample": { "messages": [ @@ -68587,10 +68587,10 @@ ] }, "predict": { - "A": 0.003924431279301643, - "B": 0.025590505450963974, - "C": 0.9602766633033752, - "D": 0.0008756589377298951 + "A": 0.000962807796895504, + "B": 0.0010910042328760028, + "C": 0.991876482963562, + "D": 9.533089178148657e-05 }, "sample": { "messages": [ @@ -68632,10 +68632,10 @@ ] }, "predict": { - "A": 0.019993837922811508, - "B": 0.003937020432204008, - "C": 0.9633570909500122, - "D": 0.005055233836174011 + "A": 0.008448895998299122, + "B": 0.0007858697790652514, + "C": 0.9765596389770508, + "D": 0.004522368311882019 }, "sample": { "messages": [ @@ -68677,10 +68677,10 @@ ] }, "predict": { - "A": 0.0505470372736454, - "B": 0.8959676027297974, - "C": 0.021071139723062515, - "D": 0.01278029102832079 + "A": 0.05230723321437836, + "B": 0.8182228207588196, + "C": 0.06716381758451462, + "D": 0.011671321466565132 }, "sample": { "messages": [ @@ -68722,10 +68722,10 @@ ] }, "predict": { - "A": 0.015223369002342224, - "B": 0.025099091231822968, - "C": 0.002334579359740019, - "D": 0.9418364763259888 + "A": 0.005724163260310888, + "B": 0.007349970750510693, + "C": 0.0030639241449534893, + "D": 0.9626562595367432 }, "sample": { "messages": [ @@ -68767,10 +68767,10 @@ ] }, "predict": { - "A": 0.004032415803521872, - "B": 0.0013091337168589234, - "C": 0.9866995811462402, - "D": 0.001155306352302432 + "A": 0.0019044253276661038, + "B": 0.0002920530969277024, + "C": 0.9865167140960693, + "D": 0.00022745114983990788 }, "sample": { "messages": [ @@ -68812,10 +68812,10 @@ ] }, "predict": { - "A": 0.025287052616477013, - "B": 0.01533737313002348, - "C": 0.9488896727561951, - "D": 0.0018317879876121879 + "A": 0.04651568830013275, + "B": 0.0023158800322562456, + "C": 0.9342926144599915, + "D": 0.0007063034572638571 }, "sample": { "messages": [ @@ -68857,10 +68857,10 @@ ] }, "predict": { - "A": 0.008419093675911427, - "B": 0.9731149673461914, - "C": 0.0006098770536482334, - "D": 0.0002880855754483491 + "A": 0.018875639885663986, + "B": 0.9094793200492859, + "C": 0.0015494071412831545, + "D": 0.0004170172323938459 }, "sample": { "messages": [ @@ -68902,10 +68902,10 @@ ] }, "predict": { - "A": 0.9732374548912048, - "B": 0.003097601467743516, - "C": 0.00019802304450422525, - "D": 0.002128948224708438 + "A": 0.9132676124572754, + "B": 0.0010693264193832874, + "C": 7.746174378553405e-05, + "D": 0.00034715948277153075 }, "sample": { "messages": [ @@ -68947,10 +68947,10 @@ ] }, "predict": { - "A": 0.07449883222579956, - "B": 0.0019853266421705484, - "C": 0.0008276072912849486, - "D": 0.907581627368927 + "A": 0.03170691058039665, + "B": 0.0005124944727867842, + "C": 0.0005124944727867842, + "D": 0.9266117811203003 }, "sample": { "messages": [ @@ -68992,10 +68992,10 @@ ] }, "predict": { - "A": 0.11621612310409546, - "B": 0.005106179974973202, - "C": 0.8587274551391602, - "D": 0.0011393427848815918 + "A": 0.15671668946743011, + "B": 0.0018532478716224432, + "C": 0.7958730459213257, + "D": 0.0003649261489044875 }, "sample": { "messages": [ @@ -69037,10 +69037,10 @@ ] }, "predict": { - "A": 0.01648666150867939, - "B": 0.050782497972249985, - "C": 0.007787747774273157, - "D": 0.9001412391662598 + "A": 0.014188406057655811, + "B": 0.014188406057655811, + "C": 0.06358803063631058, + "D": 0.8778055906295776 }, "sample": { "messages": [ @@ -69082,10 +69082,10 @@ ] }, "predict": { - "A": 0.0019149213330820203, - "B": 0.0007044601952657104, - "C": 0.9919537901878357, - "D": 0.0009045447804965079 + "A": 0.0002596326812636107, + "B": 4.8027221055235714e-05, + "C": 0.993776798248291, + "D": 5.442196925287135e-05 }, "sample": { "messages": [ @@ -69127,10 +69127,10 @@ ] }, "predict": { - "A": 0.0027182491030544043, - "B": 0.00395502895116806, - "C": 0.0005697759333997965, - "D": 0.9677636027336121 + "A": 0.0014083811547607183, + "B": 0.0007538521313108504, + "C": 0.00024473993107676506, + "D": 0.9367729425430298 }, "sample": { "messages": [ @@ -69172,10 +69172,10 @@ ] }, "predict": { - "A": 0.9468291401863098, - "B": 0.004384689033031464, - "C": 0.0026594484224915504, - "D": 0.006379684433341026 + "A": 0.8782460689544678, + "B": 0.0004026960814371705, + "C": 0.0002946187451016158, + "D": 0.0010946415131911635 }, "sample": { "messages": [ @@ -69217,10 +69217,10 @@ ] }, "predict": { - "A": 0.007499639876186848, - "B": 0.0016733960947021842, - "C": 0.9822590351104736, - "D": 8.331347635248676e-05 + "A": 0.0045267920941114426, + "B": 0.0002718549803830683, + "C": 0.9775148034095764, + "D": 3.4562483051558957e-05 }, "sample": { "messages": [ @@ -69262,10 +69262,10 @@ ] }, "predict": { - "A": 0.005163931287825108, - "B": 0.004021673928946257, - "C": 0.9840711355209351, - "D": 0.0014794910093769431 + "A": 0.0019125549588352442, + "B": 0.0004267487092874944, + "C": 0.9907280206680298, + "D": 0.00013015100557822734 }, "sample": { "messages": [ @@ -69307,10 +69307,10 @@ ] }, "predict": { - "A": 0.5358371734619141, - "B": 0.22337016463279724, - "C": 0.19712349772453308, - "D": 0.003610442392528057 + "A": 0.5573801398277283, + "B": 0.013108325190842152, + "C": 0.3380681574344635, + "D": 0.0004774706903845072 }, "sample": { "messages": [ @@ -69352,10 +69352,10 @@ ] }, "predict": { - "A": 0.9779217839241028, - "B": 0.007466524373739958, - "C": 9.398967813467607e-05, - "D": 5.70076190342661e-05 + "A": 0.9108114242553711, + "B": 0.012992027215659618, + "C": 9.919535659719259e-05, + "D": 3.428099807933904e-05 }, "sample": { "messages": [ @@ -69393,14 +69393,14 @@ "acc": false, "f1_macro": [ "D", - "A" + "C" ] }, "predict": { - "A": 0.6126969456672668, - "B": 0.2554100751876831, - "C": 0.023756839334964752, - "D": 0.05698969215154648 + "A": 0.22338570654392242, + "B": 0.038818616420030594, + "C": 0.6880764365196228, + "D": 0.004636222496628761 }, "sample": { "messages": [ @@ -69430,7 +69430,7 @@ "prompt_len": 62, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -69442,10 +69442,10 @@ ] }, "predict": { - "A": 0.003495153971016407, - "B": 0.01566419191658497, - "C": 0.969109296798706, - "D": 0.00240218173712492 + "A": 0.0014877517241984606, + "B": 0.00191031105350703, + "C": 0.9895656108856201, + "D": 0.0004004229267593473 }, "sample": { "messages": [ @@ -69487,10 +69487,10 @@ ] }, "predict": { - "A": 0.0009041366865858436, - "B": 0.0007041423232294619, - "C": 0.9915062785148621, - "D": 0.00010144132102141157 + "A": 0.0004260079294908792, + "B": 8.388593414565548e-05, + "C": 0.9890083074569702, + "D": 3.2850202842382714e-05 }, "sample": { "messages": [ @@ -69532,10 +69532,10 @@ ] }, "predict": { - "A": 0.017536703497171402, - "B": 0.957471489906311, - "C": 0.0004124237166251987, - "D": 0.00077050895197317 + "A": 0.02134140580892563, + "B": 0.9074596762657166, + "C": 0.0004160911776125431, + "D": 0.00020922377007082105 }, "sample": { "messages": [ @@ -69577,10 +69577,10 @@ ] }, "predict": { - "A": 0.012456909753382206, - "B": 0.09204480051994324, - "C": 0.8732967972755432, - "D": 0.0031496002338826656 + "A": 0.001989588839933276, + "B": 0.012187710031867027, + "C": 0.9681897759437561, + "D": 0.00047256884863600135 }, "sample": { "messages": [ @@ -69622,10 +69622,10 @@ ] }, "predict": { - "A": 0.0012513138353824615, - "B": 0.003401423804461956, - "C": 0.001104280585423112, - "D": 0.9431204795837402 + "A": 0.00022633402841165662, + "B": 0.0004228476027492434, + "C": 0.0005429470329545438, + "D": 0.9221948385238647 }, "sample": { "messages": [ @@ -69667,10 +69667,10 @@ ] }, "predict": { - "A": 0.013586224988102913, - "B": 0.011989802122116089, - "C": 0.9524679183959961, - "D": 0.003435135819017887 + "A": 0.0024285344406962395, + "B": 0.0012999007012695074, + "C": 0.9797406196594238, + "D": 0.0003724278067238629 }, "sample": { "messages": [ @@ -69712,10 +69712,10 @@ ] }, "predict": { - "A": 0.9514920711517334, - "B": 0.00565777812153101, - "C": 0.0023585131857544184, - "D": 0.004406282678246498 + "A": 0.9128497242927551, + "B": 0.0008860970847308636, + "C": 0.00022404045739676803, + "D": 0.0004455576417967677 }, "sample": { "messages": [ @@ -69757,10 +69757,10 @@ ] }, "predict": { - "A": 0.0016192822949960828, - "B": 0.9504950046539307, - "C": 0.0006750171887688339, - "D": 0.003428020281717181 + "A": 0.0031161485239863396, + "B": 0.8640215992927551, + "C": 0.0014719645259901881, + "D": 0.005821731872856617 }, "sample": { "messages": [ @@ -69795,17 +69795,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "A" + "B" ] }, "predict": { - "A": 0.5477862358093262, - "B": 0.4266163408756256, - "C": 0.0010574761545285583, - "D": 0.0004995163762941957 + "A": 0.3058781027793884, + "B": 0.6475439667701721, + "C": 0.0008070954936556518, + "D": 0.00043200707295909524 }, "sample": { "messages": [ @@ -69835,7 +69835,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " B" } } { @@ -69847,10 +69847,10 @@ ] }, "predict": { - "A": 0.9202665686607361, - "B": 0.007026321720331907, - "C": 0.002281112829223275, - "D": 0.014874723739922047 + "A": 0.8756164908409119, + "B": 0.0014917176449671388, + "C": 0.0004842897760681808, + "D": 0.0027868954930454493 }, "sample": { "messages": [ @@ -69892,10 +69892,10 @@ ] }, "predict": { - "A": 0.7163177132606506, - "B": 0.03147280216217041, - "C": 0.0010769400978460908, - "D": 0.23255430161952972 + "A": 0.8471485376358032, + "B": 0.005708042066544294, + "C": 0.00046854460379108787, + "D": 0.11464908719062805 }, "sample": { "messages": [ @@ -69937,10 +69937,10 @@ ] }, "predict": { - "A": 0.08998972922563553, - "B": 0.753474771976471, - "C": 0.022752966731786728, - "D": 0.03310537338256836 + "A": 0.2905957102775574, + "B": 0.422814279794693, + "C": 0.09434261918067932, + "D": 0.030628563836216927 }, "sample": { "messages": [ @@ -69982,10 +69982,10 @@ ] }, "predict": { - "A": 0.9729185700416565, - "B": 0.00397609593346715, - "C": 0.0007829397218301892, - "D": 0.001139170490205288 + "A": 0.9305480718612671, + "B": 0.000903276726603508, + "C": 0.0003121641057077795, + "D": 0.0003765417786780745 }, "sample": { "messages": [ @@ -70027,10 +70027,10 @@ ] }, "predict": { - "A": 0.06602190434932709, - "B": 0.008935092948377132, - "C": 0.9114041924476624, - "D": 0.00032546071452088654 + "A": 0.01785735972225666, + "B": 0.00041996483923867345, + "C": 0.9749787449836731, + "D": 7.768567593302578e-05 }, "sample": { "messages": [ @@ -70072,10 +70072,10 @@ ] }, "predict": { - "A": 0.059070006012916565, - "B": 0.9240103960037231, - "C": 0.0005110556958243251, - "D": 0.0006562084890902042 + "A": 0.03472598269581795, + "B": 0.8955949544906616, + "C": 0.0003004386380780488, + "D": 0.00036239816108718514 }, "sample": { "messages": [ @@ -70113,14 +70113,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.3505409061908722, - "B": 0.3972148895263672, - "C": 0.07821624726057053, - "D": 0.14612720906734467 + "A": 0.7121643424034119, + "B": 0.03129031881690025, + "C": 0.024368923157453537, + "D": 0.12375559657812119 }, "sample": { "messages": [ @@ -70150,7 +70150,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -70162,10 +70162,10 @@ ] }, "predict": { - "A": 0.18823224306106567, - "B": 0.7444729208946228, - "C": 0.01750832609832287, - "D": 0.012033284641802311 + "A": 0.15196406841278076, + "B": 0.771737277507782, + "C": 0.006272307131439447, + "D": 0.006676835473626852 }, "sample": { "messages": [ @@ -70207,10 +70207,10 @@ ] }, "predict": { - "A": 0.0014512615744024515, - "B": 0.017679985612630844, - "C": 0.00013498835323844105, - "D": 0.9652944803237915 + "A": 0.00098790496122092, + "B": 0.005016995593905449, + "C": 7.617905066581443e-05, + "D": 0.9560701251029968 }, "sample": { "messages": [ @@ -70252,10 +70252,10 @@ ] }, "predict": { - "A": 0.09315597265958786, - "B": 0.011125895194709301, - "C": 0.8838393092155457, - "D": 0.00020377786131575704 + "A": 0.01776905730366707, + "B": 0.002122211270034313, + "C": 0.9701576828956604, + "D": 4.6885757910786197e-05 }, "sample": { "messages": [ @@ -70290,17 +70290,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.2785281836986542, - "B": 0.5203590989112854, - "C": 0.020176513120532036, - "D": 0.13156738877296448 + "A": 0.38606521487236023, + "B": 0.1823642998933792, + "C": 0.0461089164018631, + "D": 0.2653384804725647 }, "sample": { "messages": [ @@ -70330,7 +70330,7 @@ "prompt_len": 101, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -70342,10 +70342,10 @@ ] }, "predict": { - "A": 6.444491737056524e-05, - "B": 0.9756028652191162, - "C": 9.37668010010384e-05, - "D": 0.00019850431999657303 + "A": 4.58860959042795e-05, + "B": 0.8919470906257629, + "C": 9.125538781518117e-05, + "D": 0.00019318766135256737 }, "sample": { "messages": [ @@ -70387,10 +70387,10 @@ ] }, "predict": { - "A": 0.0013157608918845654, - "B": 0.0009043083409778774, - "C": 0.9916945099830627, - "D": 0.00042716501047834754 + "A": 0.0007482750806957483, + "B": 0.00017773089348338544, + "C": 0.9898123145103455, + "D": 9.513249824522063e-05 }, "sample": { "messages": [ @@ -70432,10 +70432,10 @@ ] }, "predict": { - "A": 0.014353184960782528, - "B": 0.016264287754893303, - "C": 0.475312203168869, - "D": 0.475312203168869 + "A": 0.012638329528272152, + "B": 0.003854472888633609, + "C": 0.8860153555870056, + "D": 0.08241225779056549 }, "sample": { "messages": [ @@ -70477,10 +70477,10 @@ ] }, "predict": { - "A": 0.019884644076228142, - "B": 0.0014404387911781669, - "C": 0.0006004641763865948, - "D": 0.9580958485603333 + "A": 0.0042100572027266026, + "B": 0.00030497548868879676, + "C": 0.0002096063835779205, + "D": 0.909119188785553 }, "sample": { "messages": [ @@ -70522,10 +70522,10 @@ ] }, "predict": { - "A": 0.009873242117464542, - "B": 0.08266764134168625, - "C": 0.8887609243392944, - "D": 0.0008104450535029173 + "A": 0.011766317300498486, + "B": 0.036242809146642685, + "C": 0.9347143769264221, + "D": 0.0006235920009203255 }, "sample": { "messages": [ @@ -70567,10 +70567,10 @@ ] }, "predict": { - "A": 0.016281800344586372, - "B": 0.4199131727218628, - "C": 0.0010408606613054872, - "D": 0.5391792058944702 + "A": 0.006438809912651777, + "B": 0.06502968072891235, + "C": 0.00046642584493383765, + "D": 0.8977070450782776 }, "sample": { "messages": [ @@ -70612,10 +70612,10 @@ ] }, "predict": { - "A": 0.8957213759422302, - "B": 0.07352528721094131, - "C": 0.0008167921914719045, - "D": 0.00020651739032473415 + "A": 0.8936945796012878, + "B": 0.021017683669924736, + "C": 0.0004362081235740334, + "D": 9.14341799216345e-05 }, "sample": { "messages": [ @@ -70657,10 +70657,10 @@ ] }, "predict": { - "A": 0.009514993987977505, - "B": 0.9705557227134705, - "C": 0.00032558516249991953, - "D": 0.00032558516249991953 + "A": 0.002551192184910178, + "B": 0.9082872867584229, + "C": 0.00015321098908316344, + "D": 0.0001967267889995128 }, "sample": { "messages": [ @@ -70702,10 +70702,10 @@ ] }, "predict": { - "A": 0.9554697871208191, - "B": 0.0011187400668859482, - "C": 0.0003205243847332895, - "D": 0.00017156434478238225 + "A": 0.8663285374641418, + "B": 0.0001655907544773072, + "C": 0.00019974057795479894, + "D": 3.6948291381122544e-05 }, "sample": { "messages": [ @@ -70747,10 +70747,10 @@ ] }, "predict": { - "A": 0.0024194519501179457, - "B": 0.9760765433311462, - "C": 0.00015467038610950112, - "D": 0.000371034984709695 + "A": 0.003833022667095065, + "B": 0.9379096627235413, + "C": 0.00012321231770329177, + "D": 0.00027766323182731867 }, "sample": { "messages": [ @@ -70792,10 +70792,10 @@ ] }, "predict": { - "A": 0.006551824044436216, - "B": 0.000609414535574615, - "C": 0.0003261961101088673, - "D": 0.9723768830299377 + "A": 0.015121432952582836, + "B": 0.0002026273141382262, + "C": 0.00010845859651453793, + "D": 0.9355299472808838 }, "sample": { "messages": [ @@ -70837,10 +70837,10 @@ ] }, "predict": { - "A": 0.0031068152748048306, - "B": 0.976132333278656, - "C": 0.0008901174878701568, - "D": 0.002135280752554536 + "A": 0.01695437915623188, + "B": 0.9256777167320251, + "C": 0.0022945257369428873, + "D": 0.0037830332294106483 }, "sample": { "messages": [ @@ -70882,10 +70882,10 @@ ] }, "predict": { - "A": 0.007373785600066185, - "B": 0.9657753705978394, - "C": 0.00047139040543697774, - "D": 0.0065073431469500065 + "A": 0.010352124460041523, + "B": 0.9318684339523315, + "C": 0.000548642419744283, + "D": 0.02191544510424137 }, "sample": { "messages": [ @@ -70927,10 +70927,10 @@ ] }, "predict": { - "A": 0.962605357170105, - "B": 0.007349581923335791, - "C": 0.0012771659530699253, - "D": 0.00025148887652903795 + "A": 0.92973393201828, + "B": 0.0012335528153926134, + "C": 0.0009606918320059776, + "D": 0.00013001551269553602 }, "sample": { "messages": [ @@ -70972,10 +70972,10 @@ ] }, "predict": { - "A": 0.45566385984420776, - "B": 0.008345774374902248, - "C": 0.5163347721099854, - "D": 0.0008796382462605834 + "A": 0.19494685530662537, + "B": 0.0013135416666045785, + "C": 0.7710297703742981, + "D": 0.00027533326647244394 }, "sample": { "messages": [ @@ -71017,10 +71017,10 @@ ] }, "predict": { - "A": 0.010822751559317112, - "B": 0.005112305749207735, - "C": 0.9742330312728882, - "D": 0.00039421868859790266 + "A": 0.0035716008860617876, + "B": 0.0005477227387018502, + "C": 0.9903059005737305, + "D": 0.00027541228337213397 }, "sample": { "messages": [ @@ -71062,10 +71062,10 @@ ] }, "predict": { - "A": 0.005137749016284943, - "B": 0.1031944528222084, - "C": 0.004534048028290272, - "D": 0.8640365600585938 + "A": 0.0024736805353313684, + "B": 0.010414584539830685, + "C": 0.0013240657281130552, + "D": 0.9374909400939941 }, "sample": { "messages": [ @@ -71107,10 +71107,10 @@ ] }, "predict": { - "A": 0.8848859071731567, - "B": 0.01430284883826971, - "C": 0.04405587539076805, - "D": 0.0031913970597088337 + "A": 0.8266063332557678, + "B": 0.0019248125609010458, + "C": 0.024961350485682487, + "D": 0.001499045523814857 }, "sample": { "messages": [ @@ -71152,10 +71152,10 @@ ] }, "predict": { - "A": 0.15329359471797943, - "B": 0.02663843147456646, - "C": 0.01615702547132969, - "D": 0.7784891128540039 + "A": 0.009462833404541016, + "B": 0.004199106711894274, + "C": 0.0015447650803253055, + "D": 0.96523517370224 }, "sample": { "messages": [ @@ -71197,10 +71197,10 @@ ] }, "predict": { - "A": 3.966095391660929e-05, - "B": 3.5000673960894346e-05, - "C": 1.2876027540187351e-05, - "D": 0.98990797996521 + "A": 2.7323612812324427e-05, + "B": 3.0961706215748563e-05, + "C": 1.1390176041459199e-05, + "D": 0.9321520924568176 }, "sample": { "messages": [ @@ -71242,10 +71242,10 @@ ] }, "predict": { - "A": 0.9388870000839233, - "B": 0.013392503373324871, - "C": 0.011818842962384224, - "D": 0.001099323621019721 + "A": 0.901260495185852, + "B": 0.0010552675230428576, + "C": 0.0019714992959052324, + "D": 0.0001341622119070962 }, "sample": { "messages": [ @@ -71287,10 +71287,10 @@ ] }, "predict": { - "A": 0.9631896018981934, - "B": 0.0023875085171312094, - "C": 8.16960382508114e-05, - "D": 0.0006036565755493939 + "A": 0.8660954236984253, + "B": 0.0006150787812657654, + "C": 4.455615999177098e-05, + "D": 4.1856637835735455e-05 }, "sample": { "messages": [ @@ -71332,10 +71332,10 @@ ] }, "predict": { - "A": 0.014516288414597511, - "B": 0.0036702926736325026, - "C": 0.7925624847412109, - "D": 0.17684459686279297 + "A": 0.017608975991606712, + "B": 0.0011257041478529572, + "C": 0.8484479784965515, + "D": 0.11482495069503784 }, "sample": { "messages": [ @@ -71377,10 +71377,10 @@ ] }, "predict": { - "A": 0.005823465529829264, - "B": 0.005823465529829264, - "C": 0.9793563485145569, - "D": 0.001890602521598339 + "A": 0.002774290507659316, + "B": 0.0007466901442967355, + "C": 0.9877157807350159, + "D": 0.0003313417255412787 }, "sample": { "messages": [ @@ -71422,10 +71422,10 @@ ] }, "predict": { - "A": 0.0014755470911040902, - "B": 0.0074934461154043674, - "C": 0.9814478158950806, - "D": 0.00010041223868029192 + "A": 0.0006865304312668741, + "B": 0.0016469010151922703, + "C": 0.9667068123817444, + "D": 2.6619674827088602e-05 }, "sample": { "messages": [ @@ -71467,10 +71467,10 @@ ] }, "predict": { - "A": 0.8071644902229309, - "B": 0.15894010663032532, - "C": 0.006983341183513403, - "D": 0.007913162931799889 + "A": 0.9223440885543823, + "B": 0.0278523787856102, + "C": 0.003124950220808387, + "D": 0.004271300975233316 }, "sample": { "messages": [ @@ -71512,10 +71512,10 @@ ] }, "predict": { - "A": 0.0176433976739645, - "B": 0.9632968902587891, - "C": 0.000775196822360158, - "D": 0.0012780834222212434 + "A": 0.007975985296070576, + "B": 0.9218984842300415, + "C": 0.0005777783808298409, + "D": 0.00047899511992000043 }, "sample": { "messages": [ @@ -71557,10 +71557,10 @@ ] }, "predict": { - "A": 0.0012981850886717439, - "B": 0.00047757563879713416, - "C": 3.2499425287824124e-05, - "D": 0.9784476161003113 + "A": 0.0014363850932568312, + "B": 0.00020693089754786342, + "C": 1.5956797142280266e-05, + "D": 0.9553995728492737 }, "sample": { "messages": [ @@ -71602,10 +71602,10 @@ ] }, "predict": { - "A": 0.007465295493602753, - "B": 0.9777607917785645, - "C": 0.0011448401492089033, - "D": 0.0012972739059478045 + "A": 0.002647106535732746, + "B": 0.9424352049827576, + "C": 0.0006287429132498801, + "D": 0.0004321282613091171 }, "sample": { "messages": [ @@ -71647,10 +71647,10 @@ ] }, "predict": { - "A": 0.6940966844558716, - "B": 0.023750681430101395, - "C": 0.22534018754959106, - "D": 0.020959902554750443 + "A": 0.8763294219970703, + "B": 0.001589217921718955, + "C": 0.016050532460212708, + "D": 0.0005846405983902514 }, "sample": { "messages": [ @@ -71692,10 +71692,10 @@ ] }, "predict": { - "A": 0.00010729102359618992, - "B": 0.9851458072662354, - "C": 8.355832687811926e-05, - "D": 0.00012157664605183527 + "A": 0.00014415112673304975, + "B": 0.9096928238868713, + "C": 3.644711250672117e-05, + "D": 0.0001054632302839309 }, "sample": { "messages": [ @@ -71737,10 +71737,10 @@ ] }, "predict": { - "A": 0.9359966516494751, - "B": 0.01942599005997181, - "C": 0.0023201038129627705, - "D": 0.0023201038129627705 + "A": 0.8505878448486328, + "B": 0.0005674672429449856, + "C": 0.0001347852812614292, + "D": 0.00015273172175511718 }, "sample": { "messages": [ @@ -71782,10 +71782,10 @@ ] }, "predict": { - "A": 0.9350873827934265, - "B": 0.005560232326388359, - "C": 0.0015930335503071547, - "D": 0.0012406556634232402 + "A": 0.8635590076446533, + "B": 0.0010763355530798435, + "C": 0.001145753194577992, + "D": 0.0007397538865916431 }, "sample": { "messages": [ @@ -71827,10 +71827,10 @@ ] }, "predict": { - "A": 0.941908597946167, - "B": 0.002997888717800379, - "C": 0.00013171804312150925, - "D": 0.0002788470883388072 + "A": 0.9237815141677856, + "B": 0.00015582458581775427, + "C": 1.361579961667303e-05, + "D": 3.0683677323395386e-05 }, "sample": { "messages": [ @@ -71872,10 +71872,10 @@ ] }, "predict": { - "A": 0.0014841826632618904, - "B": 0.001905728247947991, - "C": 0.9871916770935059, - "D": 0.000701078271958977 + "A": 0.0006595653831027448, + "B": 0.0004258474218659103, + "C": 0.9886356592178345, + "D": 0.000258289510384202 }, "sample": { "messages": [ @@ -71917,10 +71917,10 @@ ] }, "predict": { - "A": 0.9503322243690491, - "B": 0.013555760495364666, - "C": 0.005650881677865982, - "D": 0.0003612487926147878 + "A": 0.8482314944267273, + "B": 0.0016374719562008977, + "C": 0.0021025557070970535, + "D": 9.833784133661538e-05 }, "sample": { "messages": [ @@ -71962,10 +71962,10 @@ ] }, "predict": { - "A": 0.35259130597114563, - "B": 0.2745983898639679, - "C": 0.31116074323654175, - "D": 0.02894245833158493 + "A": 0.5877342224121094, + "B": 0.27762600779533386, + "C": 0.054667796939611435, + "D": 0.00892427284270525 }, "sample": { "messages": [ @@ -72007,10 +72007,10 @@ ] }, "predict": { - "A": 0.3884584903717041, - "B": 0.09821769595146179, - "C": 0.440181165933609, - "D": 0.05257214233279228 + "A": 0.21684832870960236, + "B": 0.0032926679123193026, + "C": 0.7568750381469727, + "D": 0.0019971041474491358 }, "sample": { "messages": [ @@ -72052,10 +72052,10 @@ ] }, "predict": { - "A": 0.07797737419605255, - "B": 0.8383358120918274, - "C": 0.03250580653548241, - "D": 0.028686273843050003 + "A": 0.08665469288825989, + "B": 0.8221568465232849, + "C": 0.028132660314440727, + "D": 0.011727437376976013 }, "sample": { "messages": [ @@ -72097,10 +72097,10 @@ ] }, "predict": { - "A": 0.9511400461196899, - "B": 0.0034303467255085707, - "C": 0.0003615558671299368, - "D": 0.0012619539629667997 + "A": 0.8351991772651672, + "B": 0.00016993662575259805, + "C": 1.1564332453417592e-05, + "D": 4.0363487642025575e-05 }, "sample": { "messages": [ @@ -72135,17 +72135,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.37295323610305786, - "B": 0.29045626521110535, - "C": 0.22620756924152374, - "D": 0.08321711421012878 + "A": 0.2149091511964798, + "B": 0.013738682493567467, + "C": 0.0047479611821472645, + "D": 0.7501066327095032 }, "sample": { "messages": [ @@ -72175,7 +72175,7 @@ "prompt_len": 79, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -72187,10 +72187,10 @@ ] }, "predict": { - "A": 0.007463078945875168, - "B": 0.00958278402686119, - "C": 0.9774705171585083, - "D": 0.0008913377532735467 + "A": 0.0040245624259114265, + "B": 0.004560426808893681, + "C": 0.9847779273986816, + "D": 0.0001882312644738704 }, "sample": { "messages": [ @@ -72232,10 +72232,10 @@ ] }, "predict": { - "A": 0.00013411170220933855, - "B": 0.9590255618095398, - "C": 0.0001519684592494741, - "D": 0.013679764233529568 + "A": 0.00019790371879935265, + "B": 0.9137211441993713, + "C": 0.0005053649074397981, + "D": 0.014768926426768303 }, "sample": { "messages": [ @@ -72277,10 +72277,10 @@ ] }, "predict": { - "A": 0.31605246663093567, - "B": 0.0427730493247509, - "C": 0.020204557105898857, - "D": 0.5904636979103088 + "A": 0.2977246940135956, + "B": 0.019032903015613556, + "C": 0.006577595137059689, + "D": 0.6302831768989563 }, "sample": { "messages": [ @@ -72322,10 +72322,10 @@ ] }, "predict": { - "A": 0.9527888298034668, - "B": 0.003436292987316847, - "C": 0.00236172741279006, - "D": 0.007274631876498461 + "A": 0.933957040309906, + "B": 0.0007060497882775962, + "C": 0.0007515861070714891, + "D": 0.01710602082312107 }, "sample": { "messages": [ @@ -72367,10 +72367,10 @@ ] }, "predict": { - "A": 0.0003733703924808651, - "B": 0.00032949820160865784, - "C": 0.00013735529500991106, - "D": 0.9822202920913696 + "A": 0.00016728363698348403, + "B": 4.792756590177305e-05, + "C": 6.550921534653753e-05, + "D": 0.9316297769546509 }, "sample": { "messages": [ @@ -72408,14 +72408,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.0737781971693039, - "B": 0.8988024592399597, - "C": 0.0022279086988419294, - "D": 0.007776164915412664 + "A": 0.7939362525939941, + "B": 0.15633530914783478, + "C": 0.003913777880370617, + "D": 0.014541449956595898 }, "sample": { "messages": [ @@ -72445,7 +72445,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -72457,10 +72457,10 @@ ] }, "predict": { - "A": 0.9256114363670349, - "B": 0.02466670796275139, - "C": 0.0025998521596193314, - "D": 0.011651728302240372 + "A": 0.8636306524276733, + "B": 0.0013821569737046957, + "C": 0.0010764249600470066, + "D": 0.0006528846570290625 }, "sample": { "messages": [ @@ -72502,10 +72502,10 @@ ] }, "predict": { - "A": 0.9394205212593079, - "B": 0.002638638950884342, - "C": 0.013400113210082054, - "D": 0.0010999483056366444 + "A": 0.8886917233467102, + "B": 0.0005928881000727415, + "C": 0.0036318798083812, + "D": 0.00038279732689261436 }, "sample": { "messages": [ @@ -72547,10 +72547,10 @@ ] }, "predict": { - "A": 0.9580726027488708, - "B": 0.008288952521979809, - "C": 0.0016321915900334716, - "D": 0.0008736491436138749 + "A": 0.9117279052734375, + "B": 0.0017600489081814885, + "C": 0.0006474858382716775, + "D": 0.00041804826469160616 }, "sample": { "messages": [ @@ -72592,10 +72592,10 @@ ] }, "predict": { - "A": 0.06592882424592972, - "B": 0.008922495879232883, - "C": 0.9101192951202393, - "D": 0.00019712360517587513 + "A": 0.010813615284860134, + "B": 0.0006912912358529866, + "C": 0.9734105467796326, + "D": 4.419277320266701e-05 }, "sample": { "messages": [ @@ -72637,10 +72637,10 @@ ] }, "predict": { - "A": 0.9228927493095398, - "B": 0.0015722585376352072, - "C": 0.021704358980059624, - "D": 0.00021278203348629177 + "A": 0.768115222454071, + "B": 0.0001468181872041896, + "C": 0.01095657516270876, + "D": 5.073900683782995e-05 }, "sample": { "messages": [ @@ -72682,10 +72682,10 @@ ] }, "predict": { - "A": 0.03239607810974121, - "B": 0.003869159845635295, - "C": 0.9467521905899048, - "D": 0.004968099761754274 + "A": 0.015570202842354774, + "B": 0.0006841069553047419, + "C": 0.9632943272590637, + "D": 0.000775194785092026 }, "sample": { "messages": [ @@ -72727,10 +72727,10 @@ ] }, "predict": { - "A": 0.08921008557081223, - "B": 0.8464017510414124, - "C": 0.007322809658944607, - "D": 0.025559118017554283 + "A": 0.13332146406173706, + "B": 0.7672120928764343, + "C": 0.007521483115851879, + "D": 0.010943692177534103 }, "sample": { "messages": [ @@ -72772,10 +72772,10 @@ ] }, "predict": { - "A": 0.0013031099224463105, - "B": 0.982159435749054, - "C": 0.00025659753009676933, - "D": 0.00013734678213950247 + "A": 0.0022538115736097097, + "B": 0.9092524647712708, + "C": 0.00020963713177479804, + "D": 9.902557212626562e-05 }, "sample": { "messages": [ @@ -72817,10 +72817,10 @@ ] }, "predict": { - "A": 0.00035121626569889486, - "B": 0.3398984670639038, - "C": 0.0001071149090421386, - "D": 0.6350138783454895 + "A": 0.0003197322366759181, + "B": 0.2730700969696045, + "C": 8.605476614320651e-05, + "D": 0.6550611257553101 }, "sample": { "messages": [ @@ -72862,10 +72862,10 @@ ] }, "predict": { - "A": 0.953292191028595, - "B": 0.0007671457133255899, - "C": 0.015408532693982124, - "D": 0.0007671457133255899 + "A": 0.8827427625656128, + "B": 0.00013988057617098093, + "C": 0.01111209113150835, + "D": 0.00015850545605644584 }, "sample": { "messages": [ @@ -72907,10 +72907,10 @@ ] }, "predict": { - "A": 0.7422834038734436, - "B": 0.12898950278759003, - "C": 0.005667403340339661, - "D": 0.0690431073307991 + "A": 0.7688668966293335, + "B": 0.11790956556797028, + "C": 0.0012304967967793345, + "D": 0.03378165513277054 }, "sample": { "messages": [ @@ -72952,10 +72952,10 @@ ] }, "predict": { - "A": 0.7831448316574097, - "B": 0.13609015941619873, - "C": 0.023648925125598907, - "D": 0.030365819111466408 + "A": 0.6465504765510559, + "B": 0.09915173053741455, + "C": 0.02212374098598957, + "D": 0.14426492154598236 }, "sample": { "messages": [ @@ -72997,10 +72997,10 @@ ] }, "predict": { - "A": 0.028730247169733047, - "B": 0.011976548470556736, - "C": 0.9514151215553284, - "D": 0.0005962772411294281 + "A": 0.036560893058776855, + "B": 0.002062624553218484, + "C": 0.9429179430007935, + "D": 0.0001238702388945967 }, "sample": { "messages": [ @@ -73042,10 +73042,10 @@ ] }, "predict": { - "A": 0.387431800365448, - "B": 0.20737729966640472, - "C": 0.11100106686353683, - "D": 0.20737729966640472 + "A": 0.5834027528762817, + "B": 0.037295687943696976, + "C": 0.02563292719423771, + "D": 0.06967752426862717 }, "sample": { "messages": [ @@ -73087,10 +73087,10 @@ ] }, "predict": { - "A": 0.002157465787604451, - "B": 0.002770241117104888, - "C": 0.9862740635871887, - "D": 0.00042483018478378654 + "A": 0.0011554460506886244, + "B": 0.0007941257790662348, + "C": 0.9868189096450806, + "D": 0.00018862138676922768 }, "sample": { "messages": [ @@ -73132,10 +73132,10 @@ ] }, "predict": { - "A": 0.0004775458946824074, - "B": 0.978386640548706, - "C": 0.0002255766885355115, - "D": 0.0011455729836598039 + "A": 0.0003483228210825473, + "B": 0.9163280129432678, + "C": 0.00017514769569970667, + "D": 0.0013776434352621436 }, "sample": { "messages": [ @@ -73177,10 +73177,10 @@ ] }, "predict": { - "A": 0.1242557093501091, - "B": 0.05179748311638832, - "C": 0.8102489709854126, - "D": 0.00039547868072986603 + "A": 0.0073754191398620605, + "B": 0.015613763593137264, + "C": 0.965989351272583, + "D": 0.00018464001186657697 }, "sample": { "messages": [ @@ -73222,10 +73222,10 @@ ] }, "predict": { - "A": 0.5069368481636047, - "B": 0.44737017154693604, - "C": 0.003415713319554925, - "D": 0.0023475834168493748 + "A": 0.6036592125892639, + "B": 0.19597946107387543, + "C": 0.002045229310169816, + "D": 0.0021771350875496864 }, "sample": { "messages": [ @@ -73267,10 +73267,10 @@ ] }, "predict": { - "A": 0.010044313035905361, - "B": 0.4270950257778168, - "C": 0.00022190775780472904, - "D": 0.54840087890625 + "A": 0.00698445551097393, + "B": 0.3365299105644226, + "C": 0.00010605334682622924, + "D": 0.6287206411361694 }, "sample": { "messages": [ @@ -73312,10 +73312,10 @@ ] }, "predict": { - "A": 0.008486561477184296, - "B": 0.0012226051185280085, - "C": 0.9809131026268005, - "D": 0.00044977126526646316 + "A": 0.003553177462890744, + "B": 0.00012158304161857814, + "C": 0.9851976037025452, + "D": 8.356272155651823e-05 }, "sample": { "messages": [ @@ -73357,10 +73357,10 @@ ] }, "predict": { - "A": 0.9757705926895142, - "B": 0.005802143830806017, - "C": 0.000692967267241329, - "D": 0.0008897876250557601 + "A": 0.8891999125480652, + "B": 0.002830128651112318, + "C": 0.0008108453475870192, + "D": 0.0003830162459053099 }, "sample": { "messages": [ @@ -73402,10 +73402,10 @@ ] }, "predict": { - "A": 0.0016855142312124372, - "B": 0.0013126797275617719, - "C": 0.989372193813324, - "D": 0.000122098223073408 + "A": 0.0007032586145214736, + "B": 0.00011480381363071501, + "C": 0.9902618527412415, + "D": 3.289183950982988e-05 }, "sample": { "messages": [ @@ -73447,10 +73447,10 @@ ] }, "predict": { - "A": 0.06016599014401436, - "B": 0.830565869808197, - "C": 0.0038462833035737276, - "D": 0.07725465297698975 + "A": 0.07564181834459305, + "B": 0.717669665813446, + "C": 0.004267419688403606, + "D": 0.124712273478508 }, "sample": { "messages": [ @@ -73492,10 +73492,10 @@ ] }, "predict": { - "A": 0.9488875269889832, - "B": 0.0038778865709900856, - "C": 0.008209485560655594, - "D": 0.000408726220484823 + "A": 0.9081268310546875, + "B": 0.0005022707628086209, + "C": 0.006118910387158394, + "D": 8.728156535653397e-05 }, "sample": { "messages": [ @@ -73537,10 +73537,10 @@ ] }, "predict": { - "A": 0.019364112988114357, - "B": 0.20818383991718292, - "C": 0.011744928546249866, - "D": 0.7266330122947693 + "A": 0.07746841758489609, + "B": 0.060332462191581726, + "C": 0.04698696732521057, + "D": 0.7349998950958252 }, "sample": { "messages": [ @@ -73582,10 +73582,10 @@ ] }, "predict": { - "A": 0.7999755144119263, - "B": 0.13901488482952118, - "C": 0.024157168343663216, - "D": 0.004756827838718891 + "A": 0.8848044276237488, + "B": 0.014301531948149204, + "C": 0.005261239130049944, + "D": 0.0011739411856979132 }, "sample": { "messages": [ @@ -73627,10 +73627,10 @@ ] }, "predict": { - "A": 0.022166451439261436, - "B": 0.9425414800643921, - "C": 0.002647405257448554, - "D": 0.013444632291793823 + "A": 0.010265176184475422, + "B": 0.9240416288375854, + "C": 0.0006562307244166732, + "D": 0.002290470292791724 }, "sample": { "messages": [ @@ -73672,10 +73672,10 @@ ] }, "predict": { - "A": 5.793436139356345e-05, - "B": 0.0005496658850461245, - "C": 0.9938192963600159, - "D": 0.00025964376982301474 + "A": 2.4001588826649822e-05, + "B": 0.0001565100101288408, + "C": 0.9876859188079834, + "D": 0.0001565100101288408 }, "sample": { "messages": [ @@ -73717,10 +73717,10 @@ ] }, "predict": { - "A": 0.004577879328280687, - "B": 0.001684107817709446, - "C": 0.98854660987854, - "D": 0.000846822455059737 + "A": 0.004578670021146536, + "B": 0.0005468440940603614, + "C": 0.9887173175811768, + "D": 0.00021414720686152577 }, "sample": { "messages": [ @@ -73762,10 +73762,10 @@ ] }, "predict": { - "A": 0.022562088444828987, - "B": 0.9593643546104431, - "C": 0.0023780264891684055, - "D": 0.00028401476447470486 + "A": 0.002593226730823517, + "B": 0.9232526421546936, + "C": 0.004275508224964142, + "D": 0.0001657794346101582 }, "sample": { "messages": [ @@ -73800,17 +73800,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.23795205354690552, - "B": 0.00435824366286397, - "C": 0.7329439520835876, - "D": 0.0003157101455144584 + "A": 0.49795353412628174, + "B": 0.0004833601415157318, + "C": 0.4394424557685852, + "D": 0.0001078522254829295 }, "sample": { "messages": [ @@ -73840,7 +73840,7 @@ "prompt_len": 64, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -73852,10 +73852,10 @@ ] }, "predict": { - "A": 0.11332858353853226, - "B": 0.017379499971866608, - "C": 0.8373912572860718, - "D": 0.0007636019727215171 + "A": 0.06265684217214584, + "B": 0.0018920727306976914, + "C": 0.8649510145187378, + "D": 0.0001370612735627219 }, "sample": { "messages": [ @@ -73897,10 +73897,10 @@ ] }, "predict": { - "A": 0.917026162147522, - "B": 0.016795920208096504, - "C": 0.008990208618342876, - "D": 0.02156638912856579 + "A": 0.9067798852920532, + "B": 0.002247682772576809, + "C": 0.006109834648668766, + "D": 0.00888975802809 }, "sample": { "messages": [ @@ -73942,10 +73942,10 @@ ] }, "predict": { - "A": 0.7134184837341309, - "B": 0.0402483195066452, - "C": 0.008980614133179188, - "D": 0.20439782738685608 + "A": 0.8963989615440369, + "B": 0.006844089832156897, + "C": 0.0036633776035159826, + "D": 0.00775537034496665 }, "sample": { "messages": [ @@ -73987,10 +73987,10 @@ ] }, "predict": { - "A": 0.005747524555772543, - "B": 0.9665850400924683, - "C": 0.001453201868571341, - "D": 0.005747524555772543 + "A": 0.0007820245227776468, + "B": 0.9129040837287903, + "C": 0.00010583551193121821, + "D": 0.0010041393106803298 }, "sample": { "messages": [ @@ -74032,10 +74032,10 @@ ] }, "predict": { - "A": 0.7887941598892212, - "B": 0.15532276034355164, - "C": 0.007733066100627184, - "D": 0.001185904024168849 + "A": 0.8646042346954346, + "B": 0.0007884170045144856, + "C": 0.0014729569666087627, + "D": 5.040180985815823e-05 }, "sample": { "messages": [ @@ -74077,10 +74077,10 @@ ] }, "predict": { - "A": 0.15872465074062347, - "B": 0.014763693325221539, - "C": 0.8060703277587891, - "D": 0.005431259050965309 + "A": 0.3011859357357025, + "B": 0.005516412667930126, + "C": 0.6376106142997742, + "D": 0.002952723065391183 }, "sample": { "messages": [ @@ -74122,10 +74122,10 @@ ] }, "predict": { - "A": 0.9410719275474548, - "B": 0.010454365983605385, - "C": 0.001603228971362114, - "D": 0.0018166962545365095 + "A": 0.8791108131408691, + "B": 0.0007074495078995824, + "C": 0.00029490882297977805, + "D": 0.00037867043283768 }, "sample": { "messages": [ @@ -74167,10 +74167,10 @@ ] }, "predict": { - "A": 0.009585493244230747, - "B": 0.13232363760471344, - "C": 0.7614700198173523, - "D": 0.07082774490118027 + "A": 0.011223465204238892, + "B": 0.03917374089360237, + "C": 0.8915902376174927, + "D": 0.03457070514559746 }, "sample": { "messages": [ @@ -74212,10 +74212,10 @@ ] }, "predict": { - "A": 0.006561357527971268, - "B": 0.006561357527971268, - "C": 0.0045095509849488735, - "D": 0.9737918376922607 + "A": 0.2171623855829239, + "B": 0.0074308887124061584, + "C": 0.07050230354070663, + "D": 0.6689072251319885 }, "sample": { "messages": [ @@ -74257,10 +74257,10 @@ ] }, "predict": { - "A": 0.008467299863696098, - "B": 0.002425922080874443, - "C": 0.9786867499351501, - "D": 0.0005412963218986988 + "A": 0.0021625144872814417, + "B": 0.0003115397703368217, + "C": 0.9885820746421814, + "D": 6.951391696929932e-05 }, "sample": { "messages": [ @@ -74302,10 +74302,10 @@ ] }, "predict": { - "A": 0.0004808501398656517, - "B": 0.9851562976837158, - "C": 0.0005448745796456933, - "D": 0.0004243487201165408 + "A": 0.00027908527408726513, + "B": 0.94271320104599, + "C": 0.0006289283628575504, + "D": 0.0003162450448144227 }, "sample": { "messages": [ @@ -74347,10 +74347,10 @@ ] }, "predict": { - "A": 0.00961992982774973, - "B": 0.0016716932877898216, - "C": 0.9812595248222351, - "D": 0.0002262390626128763 + "A": 0.003544403240084648, + "B": 0.0002567556803114712, + "C": 0.9827647805213928, + "D": 6.491800741059706e-05 }, "sample": { "messages": [ @@ -74392,10 +74392,10 @@ ] }, "predict": { - "A": 0.8248087763786316, - "B": 0.08693420141935349, - "C": 0.010382810607552528, - "D": 0.019397644326090813 + "A": 0.8990929126739502, + "B": 0.016467461362481117, + "C": 0.006864658556878567, + "D": 0.016467461362481117 }, "sample": { "messages": [ @@ -74437,10 +74437,10 @@ ] }, "predict": { - "A": 0.020177030935883522, - "B": 0.001876756432466209, - "C": 0.972183883190155, - "D": 0.00025399134028702974 + "A": 0.009599391371011734, + "B": 0.00018715832266025245, + "C": 0.9791644811630249, + "D": 3.2523239497095346e-05 }, "sample": { "messages": [ @@ -74482,10 +74482,10 @@ ] }, "predict": { - "A": 0.011726145632565022, - "B": 0.04637780413031578, - "C": 0.9315231442451477, - "D": 0.000515210849698633 + "A": 0.01951259933412075, + "B": 0.028390666469931602, + "C": 0.9401697516441345, + "D": 0.0002962831931654364 }, "sample": { "messages": [ @@ -74527,10 +74527,10 @@ ] }, "predict": { - "A": 0.10824846476316452, - "B": 0.7998539805412292, - "C": 0.011409304104745388, - "D": 0.02736949734389782 + "A": 0.27621257305145264, + "B": 0.58474200963974, + "C": 0.005732609424740076, + "D": 0.01213593315333128 }, "sample": { "messages": [ @@ -74572,10 +74572,10 @@ ] }, "predict": { - "A": 0.0009039713768288493, - "B": 0.0016888410318642855, - "C": 0.9913250207901001, - "D": 0.0009622724610380828 + "A": 0.0008496592054143548, + "B": 0.0005153443198651075, + "C": 0.9918580651283264, + "D": 0.00037703398265875876 }, "sample": { "messages": [ @@ -74617,10 +74617,10 @@ ] }, "predict": { - "A": 0.006584749091416597, - "B": 0.0035245621111243963, - "C": 0.0018865622114390135, - "D": 0.9772633910179138 + "A": 0.0003011549706570804, + "B": 0.0007690261118113995, + "C": 0.0005989181227050722, + "D": 0.9556289315223694 }, "sample": { "messages": [ @@ -74662,10 +74662,10 @@ ] }, "predict": { - "A": 0.0806528776884079, - "B": 0.867100179195404, - "C": 0.0011504514841362834, - "D": 0.0014772091526538134 + "A": 0.3054884970188141, + "B": 0.5707275867462158, + "C": 0.0029949035961180925, + "D": 0.005595216993242502 }, "sample": { "messages": [ @@ -74707,10 +74707,10 @@ ] }, "predict": { - "A": 0.17293792963027954, - "B": 0.02652088925242424, - "C": 0.7750540375709534, - "D": 0.008610072545707226 + "A": 0.0368124358355999, + "B": 0.003880002535879612, + "C": 0.9494052529335022, + "D": 0.0008132926886901259 }, "sample": { "messages": [ @@ -74752,10 +74752,10 @@ ] }, "predict": { - "A": 0.9674322605133057, - "B": 0.0030791249591857195, - "C": 0.003953674808144569, - "D": 0.0008821840165182948 + "A": 0.9507496953010559, + "B": 0.0002483914722688496, + "C": 0.0011132133658975363, + "D": 0.00019344748579896986 }, "sample": { "messages": [ @@ -74797,10 +74797,10 @@ ] }, "predict": { - "A": 0.968563437461853, - "B": 0.0034931853879243135, - "C": 0.0005356973269954324, - "D": 0.0003681790258269757 + "A": 0.9315509796142578, + "B": 0.0004271375946700573, + "C": 0.00016726947796996683, + "D": 7.901250501163304e-05 }, "sample": { "messages": [ @@ -74842,10 +74842,10 @@ ] }, "predict": { - "A": 0.00947817787528038, - "B": 0.003951092250645161, - "C": 0.0002688752138055861, - "D": 0.9668003916740417 + "A": 0.001721708569675684, + "B": 0.00043531612027436495, + "C": 0.0001247201580554247, + "D": 0.9493876695632935 }, "sample": { "messages": [ @@ -74887,10 +74887,10 @@ ] }, "predict": { - "A": 0.004053284879773855, - "B": 0.002458441536873579, - "C": 0.10453558713197708, - "D": 0.8752657175064087 + "A": 0.00236300565302372, + "B": 0.000635994307231158, + "C": 0.06487318873405457, + "D": 0.8955466747283936 }, "sample": { "messages": [ @@ -74932,10 +74932,10 @@ ] }, "predict": { - "A": 0.00010691506759030744, - "B": 0.9816938042640686, - "C": 0.00010691506759030744, - "D": 0.00013728166231885552 + "A": 0.0001133038749685511, + "B": 0.9181108474731445, + "C": 0.00027180189499631524, + "D": 0.00039546939660795033 }, "sample": { "messages": [ @@ -74977,10 +74977,10 @@ ] }, "predict": { - "A": 0.00019937974866479635, - "B": 0.9799053072929382, - "C": 2.1014469893998466e-05, - "D": 0.00012092992255929857 + "A": 0.0001744721521390602, + "B": 0.9127936959266663, + "C": 2.848179792636074e-05, + "D": 9.941124153556302e-05 }, "sample": { "messages": [ @@ -75022,10 +75022,10 @@ ] }, "predict": { - "A": 0.008433986455202103, - "B": 0.9748363494873047, - "C": 0.001007296028546989, - "D": 0.0027381149120628834 + "A": 0.01331399753689766, + "B": 0.9333832859992981, + "C": 0.0008511354099027812, + "D": 0.0016926848329603672 }, "sample": { "messages": [ @@ -75067,10 +75067,10 @@ ] }, "predict": { - "A": 0.0031334049999713898, - "B": 0.0002914520737249404, - "C": 0.00013767220661975443, - "D": 0.9844865202903748 + "A": 0.009277627803385258, + "B": 0.00015963039186317474, + "C": 0.0002181888703489676, + "D": 0.9463436603546143 }, "sample": { "messages": [ @@ -75112,10 +75112,10 @@ ] }, "predict": { - "A": 0.5014572143554688, - "B": 0.3446461856365204, - "C": 0.011793144047260284, - "D": 0.12678824365139008 + "A": 0.7916776537895203, + "B": 0.04466339945793152, + "C": 0.023906594142317772, + "D": 0.0945524200797081 }, "sample": { "messages": [ @@ -75157,10 +75157,10 @@ ] }, "predict": { - "A": 0.002037892583757639, - "B": 0.002037892583757639, - "C": 0.028132239356637, - "D": 0.9316117763519287 + "A": 0.0019812399987131357, + "B": 0.0006432143854908645, + "C": 0.21512578427791595, + "D": 0.6626340746879578 }, "sample": { "messages": [ @@ -75202,10 +75202,10 @@ ] }, "predict": { - "A": 0.0045180548913776875, - "B": 0.975628137588501, - "C": 0.0010081143118441105, - "D": 0.007449012249708176 + "A": 0.006273243110626936, + "B": 0.9310318231582642, + "C": 0.0012352748308330774, + "D": 0.013280455023050308 }, "sample": { "messages": [ @@ -75247,10 +75247,10 @@ ] }, "predict": { - "A": 0.0031317700631916523, - "B": 0.9839729070663452, - "C": 0.0003300861280877143, - "D": 0.002439025091007352 + "A": 0.0020760244224220514, + "B": 0.9490435123443604, + "C": 0.0002479457180015743, + "D": 0.001259172335267067 }, "sample": { "messages": [ @@ -75292,10 +75292,10 @@ ] }, "predict": { - "A": 0.0064721424132585526, - "B": 0.0009925351478159428, - "C": 0.009416911751031876, - "D": 0.9605510830879211 + "A": 0.0037853040266782045, + "B": 8.902181434677914e-05, + "C": 0.0015779496170580387, + "D": 0.9262332916259766 }, "sample": { "messages": [ @@ -75337,10 +75337,10 @@ ] }, "predict": { - "A": 0.005783712491393089, - "B": 0.001877696719020605, - "C": 0.0002541185822337866, - "D": 0.9726709723472595 + "A": 0.0021635654848068953, + "B": 0.0003116911684628576, + "C": 0.00010771759116323665, + "D": 0.9291383028030396 }, "sample": { "messages": [ @@ -75382,10 +75382,10 @@ ] }, "predict": { - "A": 0.049085721373558044, - "B": 0.023186450824141502, - "C": 0.043317992240190506, - "D": 0.8700651526451111 + "A": 0.02193375676870346, + "B": 0.0038115154020488262, + "C": 0.01330349501222372, + "D": 0.9326470494270325 }, "sample": { "messages": [ @@ -75427,10 +75427,10 @@ ] }, "predict": { - "A": 0.8198589086532593, - "B": 0.08641248941421509, - "C": 0.004302224610000849, - "D": 0.06729812175035477 + "A": 0.6882582306861877, + "B": 0.19718928635120392, + "C": 0.001933175721205771, + "D": 0.020783597603440285 }, "sample": { "messages": [ @@ -75472,10 +75472,10 @@ ] }, "predict": { - "A": 0.9692100286483765, - "B": 0.00838530994951725, - "C": 0.0003684248076751828, - "D": 0.0005360548966564238 + "A": 0.8896236419677734, + "B": 0.0032084838021546602, + "C": 9.101767500396818e-05, + "D": 0.00015006278408691287 }, "sample": { "messages": [ @@ -75517,10 +75517,10 @@ ] }, "predict": { - "A": 0.0021316353231668472, - "B": 0.9744657874107361, - "C": 0.0001129725351347588, - "D": 0.0003704226983245462 + "A": 0.0018170352559536695, + "B": 0.9412475228309631, + "C": 7.499799539800733e-05, + "D": 0.00043158369953744113 }, "sample": { "messages": [ @@ -75562,10 +75562,10 @@ ] }, "predict": { - "A": 0.008163798600435257, - "B": 0.9436067938804626, - "C": 0.001607547397725284, - "D": 0.01728276163339615 + "A": 0.011489195749163628, + "B": 0.9126998782157898, + "C": 0.000943090592045337, + "D": 0.021464643999934196 }, "sample": { "messages": [ @@ -75607,10 +75607,10 @@ ] }, "predict": { - "A": 0.0006230304716154933, - "B": 0.000705986050888896, - "C": 0.994102418422699, - "D": 0.00021531357197090983 + "A": 0.00014766321692150086, + "B": 0.00014766321692150086, + "C": 0.9919561147689819, + "D": 7.903842197265476e-05 }, "sample": { "messages": [ @@ -75652,10 +75652,10 @@ ] }, "predict": { - "A": 0.007122420240193605, - "B": 0.01605062000453472, - "C": 0.8763341903686523, - "D": 0.05602216720581055 + "A": 0.0047476524487137794, + "B": 0.0015413371147587895, + "C": 0.9630932807922363, + "D": 0.01763966865837574 }, "sample": { "messages": [ @@ -75697,10 +75697,10 @@ ] }, "predict": { - "A": 0.0024451869539916515, - "B": 0.9864588379859924, - "C": 0.00012173868890386075, - "D": 0.00015631556743755937 + "A": 0.0010928966803476214, + "B": 0.9333980679512024, + "C": 0.00016760114522185177, + "D": 0.0002152041270164773 }, "sample": { "messages": [ @@ -75742,10 +75742,10 @@ ] }, "predict": { - "A": 0.0689031183719635, - "B": 0.04735637456178665, - "C": 0.022369565442204475, - "D": 0.8394117951393127 + "A": 0.1479646861553192, + "B": 0.03741133213043213, + "C": 0.022691121324896812, + "D": 0.7514267563819885 }, "sample": { "messages": [ @@ -75787,10 +75787,10 @@ ] }, "predict": { - "A": 0.0014617611886933446, - "B": 0.9722782969474792, - "C": 0.001876938622444868, - "D": 0.0014617611886933446 + "A": 0.0036960141733288765, + "B": 0.9043848514556885, + "C": 0.0028784587047994137, + "D": 0.0009344986756332219 }, "sample": { "messages": [ @@ -75832,10 +75832,10 @@ ] }, "predict": { - "A": 0.015229986049234867, - "B": 0.010467407293617725, - "C": 0.015229986049234867, - "D": 0.9422459006309509 + "A": 0.0023220658767968416, + "B": 0.0014084040885791183, + "C": 0.005570346023887396, + "D": 0.9367882609367371 }, "sample": { "messages": [ @@ -75877,10 +75877,10 @@ ] }, "predict": { - "A": 0.9536406993865967, - "B": 0.025413664057850838, - "C": 0.00043726625153794885, - "D": 0.000767426157835871 + "A": 0.9588190317153931, + "B": 0.0030517110135406256, + "C": 0.00017216573178302497, + "D": 0.0001519357319921255 }, "sample": { "messages": [ @@ -75922,10 +75922,10 @@ ] }, "predict": { - "A": 0.003933046478778124, - "B": 0.9623847007751465, - "C": 0.0006834609666839242, - "D": 0.009434888139367104 + "A": 0.004745544400066137, + "B": 0.9043406844139099, + "C": 0.0006836605025455356, + "D": 0.006904725916683674 }, "sample": { "messages": [ @@ -75967,10 +75967,10 @@ ] }, "predict": { - "A": 0.9261531233787537, - "B": 0.03169121593236923, - "C": 0.005507107824087143, - "D": 0.0026013737078756094 + "A": 0.8855477571487427, + "B": 0.0041009001433849335, + "C": 0.0024873216170817614, + "D": 0.0013313671806827188 }, "sample": { "messages": [ @@ -76012,10 +76012,10 @@ ] }, "predict": { - "A": 0.9356511235237122, - "B": 0.01713704876601696, - "C": 0.0010955347679555416, - "D": 0.0020467285066843033 + "A": 0.9221774339675903, + "B": 0.0010143393883481622, + "C": 0.00030935605172999203, + "D": 0.0002564651658758521 }, "sample": { "messages": [ @@ -76057,10 +76057,10 @@ ] }, "predict": { - "A": 0.00048303205403499305, - "B": 0.0009024225873872638, - "C": 0.9896265268325806, - "D": 0.00022816815180703998 + "A": 0.00014621279842685908, + "B": 0.0001877409522421658, + "C": 0.9822126626968384, + "D": 0.00010049049160443246 }, "sample": { "messages": [ @@ -76102,10 +76102,10 @@ ] }, "predict": { - "A": 0.9642534852027893, - "B": 0.01071188971400261, - "C": 0.0003234710020478815, - "D": 0.0009963607881218195 + "A": 0.9229714870452881, + "B": 0.0008416410419158638, + "C": 7.354177068918943e-05, + "D": 9.442950249649584e-05 }, "sample": { "messages": [ @@ -76147,10 +76147,10 @@ ] }, "predict": { - "A": 0.0016799248987808824, - "B": 0.9860913157463074, - "C": 0.00022735308448318392, - "D": 0.00022735308448318392 + "A": 0.0007501236977986991, + "B": 0.9321398735046387, + "C": 0.0001896609755931422, + "D": 0.0003126980736851692 }, "sample": { "messages": [ @@ -76192,10 +76192,10 @@ ] }, "predict": { - "A": 0.37125104665756226, - "B": 0.07310366630554199, - "C": 0.4766957759857178, - "D": 0.023733284324407578 + "A": 0.4278937876224518, + "B": 0.01375462394207716, + "C": 0.4848672151565552, + "D": 0.014641720801591873 }, "sample": { "messages": [ @@ -76230,17 +76230,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.13016484677791595, - "B": 0.8487813472747803, - "C": 0.0012760910904034972, - "D": 0.0004142861580476165 + "A": 0.4737783670425415, + "B": 0.4737783670425415, + "C": 0.0006285998388193548, + "D": 0.00026203939341939986 }, "sample": { "messages": [ @@ -76270,7 +76270,7 @@ "prompt_len": 64, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -76282,10 +76282,10 @@ ] }, "predict": { - "A": 0.00512639619410038, - "B": 0.9769182205200195, - "C": 9.389322076458484e-05, - "D": 4.7212481149472296e-05 + "A": 0.00623587379232049, + "B": 0.9254857301712036, + "C": 9.468673670198768e-05, + "D": 4.7611483751097694e-05 }, "sample": { "messages": [ @@ -76327,10 +76327,10 @@ ] }, "predict": { - "A": 0.017637912184000015, - "B": 0.0073525747284293175, - "C": 0.962997317314148, - "D": 0.0004148039151914418 + "A": 0.01231626607477665, + "B": 0.0012981249019503593, + "C": 0.9784021973609924, + "D": 0.00014564556477125734 }, "sample": { "messages": [ @@ -76372,10 +76372,10 @@ ] }, "predict": { - "A": 0.947252631187439, - "B": 0.009286538697779179, - "C": 0.0014241366880014539, - "D": 0.006382538005709648 + "A": 0.888450562953949, + "B": 0.0012548035010695457, + "C": 0.000977241899818182, + "D": 0.0024954748805612326 }, "sample": { "messages": [ @@ -76410,17 +76410,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.003766492707654834, - "B": 0.004836272448301315, - "C": 0.6334267258644104, - "D": 0.3390488922595978 + "A": 0.000700521981343627, + "B": 0.0002577076375018805, + "C": 0.055649351328611374, + "D": 0.8705023527145386 }, "sample": { "messages": [ @@ -76450,7 +76450,7 @@ "prompt_len": 105, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -76462,10 +76462,10 @@ ] }, "predict": { - "A": 0.017741236835718155, - "B": 0.002118888543918729, - "C": 0.0004172339104115963, - "D": 0.9686387181282043 + "A": 0.01994207687675953, + "B": 0.0005657132132910192, + "C": 0.000208114244742319, + "D": 0.9608631134033203 }, "sample": { "messages": [ @@ -76507,10 +76507,10 @@ ] }, "predict": { - "A": 0.0011471204925328493, - "B": 0.9797083735466003, - "C": 6.47161141387187e-05, - "D": 0.0006957637961022556 + "A": 0.00030524696921929717, + "B": 0.9099283814430237, + "C": 3.21727929986082e-05, + "D": 0.0010654166107997298 }, "sample": { "messages": [ @@ -76552,10 +76552,10 @@ ] }, "predict": { - "A": 0.939965546131134, - "B": 0.005589239299297333, - "C": 0.0033900451380759478, - "D": 0.0033900451380759478 + "A": 0.8480736613273621, + "B": 0.0007733430829830468, + "C": 0.001855153706856072, + "D": 0.0009929921943694353 }, "sample": { "messages": [ @@ -76597,10 +76597,10 @@ ] }, "predict": { - "A": 0.9192292094230652, - "B": 0.05185936391353607, - "C": 0.00901180598884821, - "D": 0.00901180598884821 + "A": 0.9455474019050598, + "B": 0.010504083707928658, + "C": 0.0034101763740181923, + "D": 0.0030094701796770096 }, "sample": { "messages": [ @@ -76642,10 +76642,10 @@ ] }, "predict": { - "A": 0.1480972021818161, - "B": 0.45617151260375977, - "C": 0.3552667200565338, - "D": 0.003482912201434374 + "A": 0.05680220574140549, + "B": 0.4755990207195282, + "C": 0.3703968822956085, + "D": 0.0007611499750055373 }, "sample": { "messages": [ @@ -76687,10 +76687,10 @@ ] }, "predict": { - "A": 0.8027955293655396, - "B": 0.07467161118984222, - "C": 0.07467161118984222, - "D": 0.014703713357448578 + "A": 0.8641047477722168, + "B": 0.009599337354302406, + "C": 0.029568038880825043, + "D": 0.00400159927085042 }, "sample": { "messages": [ @@ -76732,10 +76732,10 @@ ] }, "predict": { - "A": 0.6366358399391174, - "B": 0.05225825309753418, - "C": 0.23420526087284088, - "D": 0.04611774906516075 + "A": 0.5217368006706238, + "B": 0.008433089591562748, + "C": 0.40632903575897217, + "D": 0.003302445402368903 }, "sample": { "messages": [ @@ -76777,10 +76777,10 @@ ] }, "predict": { - "A": 0.7809478044509888, - "B": 0.009830682538449764, - "C": 0.014303558506071568, - "D": 0.15377773344516754 + "A": 0.9242076873779297, + "B": 0.0011519277468323708, + "C": 0.0025959094054996967, + "D": 0.003777025733143091 }, "sample": { "messages": [ @@ -76822,10 +76822,10 @@ ] }, "predict": { - "A": 0.0030873557552695274, - "B": 0.003964243456721306, - "C": 0.9700183272361755, - "D": 0.001287001301534474 + "A": 0.0006984930369071662, + "B": 0.0006984930369071662, + "C": 0.9835514426231384, + "D": 0.0008968828478828073 }, "sample": { "messages": [ @@ -76867,10 +76867,10 @@ ] }, "predict": { - "A": 0.962648868560791, - "B": 0.001858349540270865, - "C": 0.0006836485699750483, - "D": 0.0006033176905475557 + "A": 0.8793919682502747, + "B": 0.00037879153387621045, + "C": 0.00027712981682270765, + "D": 0.00011552489741006866 }, "sample": { "messages": [ @@ -76912,10 +76912,10 @@ ] }, "predict": { - "A": 0.7790537476539612, - "B": 0.17383040487766266, - "C": 0.00360773503780365, - "D": 0.0031838146969676018 + "A": 0.854781448841095, + "B": 0.0375564768910408, + "C": 0.0016501164063811302, + "D": 0.001134107238613069 }, "sample": { "messages": [ @@ -76957,10 +76957,10 @@ ] }, "predict": { - "A": 0.165000319480896, - "B": 0.11340294778347015, - "C": 0.025303618982434273, - "D": 0.652588963508606 + "A": 0.2598355710506439, + "B": 0.12273763120174408, + "C": 0.013770781457424164, + "D": 0.5500718951225281 }, "sample": { "messages": [ @@ -77002,10 +77002,10 @@ ] }, "predict": { - "A": 0.0034146469552069902, - "B": 0.9467869400978088, - "C": 0.0005574257811531425, - "D": 0.00043412361992523074 + "A": 0.0017941653495654464, + "B": 0.8730911016464233, + "C": 0.00021428251056931913, + "D": 0.0003318872186355293 }, "sample": { "messages": [ @@ -77047,10 +77047,10 @@ ] }, "predict": { - "A": 0.10064379870891571, - "B": 0.037024784833192825, - "C": 0.0011180515866726637, - "D": 0.8426801562309265 + "A": 0.040321994572877884, + "B": 0.006582383997738361, + "C": 0.00028920979821123183, + "D": 0.9177243113517761 }, "sample": { "messages": [ @@ -77092,10 +77092,10 @@ ] }, "predict": { - "A": 0.019578689709305763, - "B": 0.0603066049516201, - "C": 0.6483571529388428, - "D": 0.23851728439331055 + "A": 0.008542848750948906, + "B": 0.043384164571762085, + "C": 0.7690026760101318, + "D": 0.15142560005187988 }, "sample": { "messages": [ @@ -77137,10 +77137,10 @@ ] }, "predict": { - "A": 0.001897227019071579, - "B": 0.9827879071235657, - "C": 0.00010703422594815493, - "D": 0.0006979508325457573 + "A": 0.00177946372423321, + "B": 0.9217850565910339, + "C": 0.00010686511086532846, + "D": 0.000952478323597461 }, "sample": { "messages": [ @@ -77182,10 +77182,10 @@ ] }, "predict": { - "A": 0.00848326925188303, - "B": 0.9805325269699097, - "C": 0.0005423172260634601, - "D": 0.0003727288276422769 + "A": 0.002060005208477378, + "B": 0.9417204856872559, + "C": 9.634772140998393e-05, + "D": 0.0001492262672400102 }, "sample": { "messages": [ @@ -77227,10 +77227,10 @@ ] }, "predict": { - "A": 0.7634467482566833, - "B": 0.17034800350666046, - "C": 0.020345166325569153, - "D": 0.005828987807035446 + "A": 0.898110032081604, + "B": 0.008804760873317719, + "C": 0.00777017418295145, + "D": 0.0005287670064717531 }, "sample": { "messages": [ @@ -77272,10 +77272,10 @@ ] }, "predict": { - "A": 0.05304252728819847, - "B": 0.04130956158041954, - "C": 0.8297247886657715, - "D": 0.03645556420087814 + "A": 0.105075404047966, + "B": 0.038655079901218414, + "C": 0.7764080762863159, + "D": 0.02656722255051136 }, "sample": { "messages": [ @@ -77317,10 +77317,10 @@ ] }, "predict": { - "A": 0.03684602305293083, - "B": 0.950271487236023, - "C": 0.00015058126882649958, - "D": 0.0006748584564775229 + "A": 0.3454747200012207, + "B": 0.5695914626121521, + "C": 0.00013979467621538788, + "D": 0.00040450927917845547 }, "sample": { "messages": [ @@ -77362,10 +77362,10 @@ ] }, "predict": { - "A": 0.9694035053253174, - "B": 0.0044892290607094765, - "C": 0.000368498353054747, - "D": 0.0011350547429174185 + "A": 0.9385427832603455, + "B": 0.0005525719607248902, + "C": 0.00013124736142344773, + "D": 0.00040427030762657523 }, "sample": { "messages": [ @@ -77407,10 +77407,10 @@ ] }, "predict": { - "A": 0.07983370870351791, - "B": 0.10250851511955261, - "C": 0.7574411630630493, - "D": 0.037710774689912796 + "A": 0.027698570862412453, + "B": 0.019036930054426193, + "C": 0.9172506928443909, + "D": 0.016800032928586006 }, "sample": { "messages": [ @@ -77452,10 +77452,10 @@ ] }, "predict": { - "A": 0.7490200996398926, - "B": 0.0372915156185627, - "C": 0.007343135308474302, - "D": 0.16712898015975952 + "A": 0.6764383316040039, + "B": 0.005164670292288065, + "C": 0.003549622604623437, + "D": 0.2196073830127716 }, "sample": { "messages": [ @@ -77490,17 +77490,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "B" ] }, "predict": { - "A": 0.11211511492729187, - "B": 0.39132019877433777, - "C": 0.44342389702796936, - "D": 0.01339024119079113 + "A": 0.11555301398038864, + "B": 0.6649617552757263, + "C": 0.11555301398038864, + "D": 0.012179199606180191 }, "sample": { "messages": [ @@ -77530,7 +77530,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " B" } } { @@ -77542,10 +77542,10 @@ ] }, "predict": { - "A": 0.009342381730675697, - "B": 0.022411208599805832, - "C": 0.0012643537484109402, - "D": 0.952948808670044 + "A": 0.03203437849879265, + "B": 0.015131969004869461, + "C": 0.0018072560196742415, + "D": 0.9361817836761475 }, "sample": { "messages": [ @@ -77587,10 +77587,10 @@ ] }, "predict": { - "A": 0.01925014704465866, - "B": 0.9275240302085876, - "C": 0.009093126282095909, - "D": 0.007081733085215092 + "A": 0.047536756843328476, + "B": 0.842609167098999, + "C": 0.012019173242151737, + "D": 0.0093605425208807 }, "sample": { "messages": [ @@ -77632,10 +77632,10 @@ ] }, "predict": { - "A": 0.469716340303421, - "B": 0.3658154606819153, - "C": 0.010377385653555393, - "D": 0.11876289546489716 + "A": 0.8100550770759583, + "B": 0.01681215688586235, + "C": 0.008998899720609188, + "D": 0.07534685730934143 }, "sample": { "messages": [ @@ -77677,10 +77677,10 @@ ] }, "predict": { - "A": 0.05964169651269913, - "B": 0.0033647543750703335, - "C": 0.18370935320854187, - "D": 0.7265846133232117 + "A": 0.06632131338119507, + "B": 0.0017674033297225833, + "C": 0.1590965837240219, + "D": 0.7130213975906372 }, "sample": { "messages": [ @@ -77722,10 +77722,10 @@ ] }, "predict": { - "A": 0.2437787801027298, - "B": 0.2437787801027298, - "C": 0.401923269033432, - "D": 0.06984378397464752 + "A": 0.25980138778686523, + "B": 0.05115792900323868, + "C": 0.6232311725616455, + "D": 0.014656992629170418 }, "sample": { "messages": [ @@ -77760,17 +77760,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.00838522519916296, - "B": 0.855316162109375, - "C": 0.054678529500961304, - "D": 0.010766841471195221 + "A": 0.004309393465518951, + "B": 0.41293787956237793, + "C": 0.4679199159145355, + "D": 0.005890242289751768 }, "sample": { "messages": [ @@ -77800,7 +77800,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -77812,10 +77812,10 @@ ] }, "predict": { - "A": 0.20974615216255188, - "B": 0.7320860028266907, - "C": 0.015193959698081017, - "D": 0.0043531423434615135 + "A": 0.22960756719112396, + "B": 0.7072411179542542, + "C": 0.004765353165566921, + "D": 0.0018661391222849488 }, "sample": { "messages": [ @@ -77857,10 +77857,10 @@ ] }, "predict": { - "A": 0.014004537835717201, - "B": 0.10348031669855118, - "C": 0.8664301037788391, - "D": 0.0010144852567464113 + "A": 0.0055802734568715096, + "B": 0.03638795390725136, + "C": 0.938457727432251, + "D": 0.00045805677655152977 }, "sample": { "messages": [ @@ -77902,10 +77902,10 @@ ] }, "predict": { - "A": 0.036487679928541183, - "B": 0.0028136279433965683, - "C": 0.9410297274589539, - "D": 0.010453896597027779 + "A": 0.058412808924913406, + "B": 0.0006489077350124717, + "C": 0.9137300848960876, + "D": 0.01896386221051216 }, "sample": { "messages": [ @@ -77947,10 +77947,10 @@ ] }, "predict": { - "A": 0.002986778039485216, - "B": 0.005580035503953695, - "C": 0.9384177327156067, - "D": 0.046721067279577255 + "A": 0.00584503123536706, + "B": 0.002436571754515171, + "C": 0.9829831123352051, + "D": 0.002150266896933317 }, "sample": { "messages": [ @@ -77992,10 +77992,10 @@ ] }, "predict": { - "A": 0.005096137057989836, - "B": 0.9711517691612244, - "C": 0.0010034887818619609, - "D": 0.009520837105810642 + "A": 0.003906198777258396, + "B": 0.955815315246582, + "C": 0.000871590746100992, + "D": 0.012031939812004566 }, "sample": { "messages": [ @@ -78037,10 +78037,10 @@ ] }, "predict": { - "A": 0.03196289390325546, - "B": 0.0055543179623782635, - "C": 0.004901668522506952, - "D": 0.93409264087677 + "A": 0.01850963942706585, + "B": 0.0008657073485665023, + "C": 0.02097417041659355, + "D": 0.8918443918228149 }, "sample": { "messages": [ @@ -78082,10 +78082,10 @@ ] }, "predict": { - "A": 0.0012996402801945806, - "B": 0.9795443415641785, - "C": 3.924572956748307e-05, - "D": 5.0392522098263726e-05 + "A": 0.0013601795071735978, + "B": 0.9047120809555054, + "C": 3.4051430702675134e-05, + "D": 6.771936023142189e-05 }, "sample": { "messages": [ @@ -78127,10 +78127,10 @@ ] }, "predict": { - "A": 0.10382720828056335, - "B": 0.014051483944058418, - "C": 0.8693345189094543, - "D": 0.0008982812869362533 + "A": 0.08144395053386688, + "B": 0.006685326807200909, + "C": 0.875605046749115, + "D": 0.0004273786908015609 }, "sample": { "messages": [ @@ -78165,17 +78165,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "D" ] }, "predict": { - "A": 0.5402981638908386, - "B": 0.03048153966665268, - "C": 0.02373904548585415, - "D": 0.37134116888046265 + "A": 0.3764384984970093, + "B": 0.030899951234459877, + "C": 0.02123720571398735, + "D": 0.4833565652370453 }, "sample": { "messages": [ @@ -78205,22 +78205,22 @@ "prompt_len": 86, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.32035499811172485, - "B": 0.04335533455014229, - "C": 0.28271231055259705, - "D": 0.28271231055259705 + "A": 0.363180011510849, + "B": 0.06311122328042984, + "C": 0.09182628989219666, + "D": 0.4115368723869324 }, "sample": { "messages": [ @@ -78250,7 +78250,7 @@ "prompt_len": 93, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -78262,10 +78262,10 @@ ] }, "predict": { - "A": 0.0034685467835515738, - "B": 0.9617318511009216, - "C": 0.009428488090634346, - "D": 0.009428488090634346 + "A": 0.004125280771404505, + "B": 0.6122459173202515, + "C": 0.32771164178848267, + "D": 0.0063893599435687065 }, "sample": { "messages": [ @@ -78307,10 +78307,10 @@ ] }, "predict": { - "A": 0.8851789236068726, - "B": 0.07265990972518921, - "C": 0.004644992761313915, - "D": 0.005964288953691721 + "A": 0.9244946241378784, + "B": 0.01027020812034607, + "C": 0.0025967152323573828, + "D": 0.0021527523640543222 }, "sample": { "messages": [ @@ -78352,10 +78352,10 @@ ] }, "predict": { - "A": 0.930167019367218, - "B": 0.024788111448287964, - "C": 0.0012341274414211512, - "D": 0.0005829604924656451 + "A": 0.8911089301109314, + "B": 0.001720244879834354, + "C": 0.00026380812050774693, + "D": 0.000132651039166376 }, "sample": { "messages": [ @@ -78390,17 +78390,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.27990594506263733, - "B": 0.6714594960212708, - "C": 0.003109474666416645, - "D": 0.001664381823502481 + "A": 0.8195103406906128, + "B": 0.07622633129358292, + "C": 0.0011574358213692904, + "D": 0.0013115466572344303 }, "sample": { "messages": [ @@ -78430,7 +78430,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -78442,10 +78442,10 @@ ] }, "predict": { - "A": 0.008233859203755856, - "B": 0.951704740524292, - "C": 0.0013441406190395355, - "D": 0.019752001389861107 + "A": 0.001564494799822569, + "B": 0.9183355569839478, + "C": 0.0017728047678247094, + "D": 0.024472814053297043 }, "sample": { "messages": [ @@ -78487,10 +78487,10 @@ ] }, "predict": { - "A": 0.031939420849084854, - "B": 0.05265919864177704, - "C": 0.059670694172382355, - "D": 0.8237285017967224 + "A": 0.018326448276638985, + "B": 0.009809441864490509, + "C": 0.02666482701897621, + "D": 0.8830177783966064 }, "sample": { "messages": [ @@ -78532,10 +78532,10 @@ ] }, "predict": { - "A": 0.005811610724776983, - "B": 0.005128728691488504, - "C": 0.9773626923561096, - "D": 0.00018681390793062747 + "A": 0.004006164614111185, + "B": 0.001147785340435803, + "C": 0.9802761077880859, + "D": 0.00015533584519289434 }, "sample": { "messages": [ @@ -78577,10 +78577,10 @@ ] }, "predict": { - "A": 0.9494791030883789, - "B": 0.017390316352248192, - "C": 0.0016175515484064817, - "D": 0.0038803040515631437 + "A": 0.884615421295166, + "B": 0.005960491951555014, + "C": 0.001035778084769845, + "D": 0.0019350884249433875 }, "sample": { "messages": [ @@ -78622,10 +78622,10 @@ ] }, "predict": { - "A": 0.040592677891254425, - "B": 0.024620704352855682, - "C": 0.0006561195477843285, - "D": 0.9238851070404053 + "A": 0.017348850145936012, + "B": 0.001337799709290266, + "C": 0.0003177552716806531, + "D": 0.9472150802612305 }, "sample": { "messages": [ @@ -78667,10 +78667,10 @@ ] }, "predict": { - "A": 0.0037129041738808155, - "B": 0.20271770656108856, - "C": 0.06581279635429382, - "D": 0.707554280757904 + "A": 0.0022601112723350525, + "B": 0.11592159420251846, + "C": 0.07030999660491943, + "D": 0.7559037208557129 }, "sample": { "messages": [ @@ -78712,10 +78712,10 @@ ] }, "predict": { - "A": 0.00013732067600358278, - "B": 0.00017632321396376938, - "C": 0.0027581595350056887, - "D": 0.9819726943969727 + "A": 3.0270564820966683e-05, + "B": 4.688396802521311e-05, + "C": 0.000325439206790179, + "D": 0.9701206684112549 }, "sample": { "messages": [ @@ -78757,10 +78757,10 @@ ] }, "predict": { - "A": 0.009455079212784767, - "B": 0.007363622542470694, - "C": 0.0027089256327599287, - "D": 0.9644442796707153 + "A": 0.002386705484241247, + "B": 0.0006838025292381644, + "C": 0.0005325459642335773, + "D": 0.9628656506538391 }, "sample": { "messages": [ @@ -78802,10 +78802,10 @@ ] }, "predict": { - "A": 0.00037727379822172225, - "B": 0.0009050327935256064, - "C": 0.9924889802932739, - "D": 0.0007048402330838144 + "A": 8.963599248090759e-05, + "B": 0.00015731605526525527, + "C": 0.99277263879776, + "D": 0.00013883094652555883 }, "sample": { "messages": [ @@ -78847,10 +78847,10 @@ ] }, "predict": { - "A": 0.11166957020759583, - "B": 0.8251327276229858, - "C": 0.028234489262104034, - "D": 0.013337028212845325 + "A": 0.1431477814912796, + "B": 0.7269644737243652, + "C": 0.04647328332066536, + "D": 0.009151131846010685 }, "sample": { "messages": [ @@ -78892,10 +78892,10 @@ ] }, "predict": { - "A": 0.004573292098939419, - "B": 0.0013102700468152761, - "C": 0.9875560402870178, - "D": 9.49156383285299e-05 + "A": 0.0035659484565258026, + "B": 0.00018898832786362618, + "C": 0.9887386560440063, + "D": 3.961410402553156e-05 }, "sample": { "messages": [ @@ -78937,10 +78937,10 @@ ] }, "predict": { - "A": 0.0073754675686359406, - "B": 0.010731243528425694, - "C": 0.9659956693649292, - "D": 0.0073754675686359406 + "A": 0.006533846724778414, + "B": 0.005088564939796925, + "C": 0.9697088599205017, + "D": 0.00740381795912981 }, "sample": { "messages": [ @@ -78982,10 +78982,10 @@ ] }, "predict": { - "A": 0.015607325360178947, - "B": 0.9655910730361938, - "C": 0.0021122219040989876, - "D": 0.002393461065366864 + "A": 0.0072277504950761795, + "B": 0.9466485977172852, + "C": 0.0002632708055898547, + "D": 0.0020707855001091957 }, "sample": { "messages": [ @@ -79027,10 +79027,10 @@ ] }, "predict": { - "A": 0.0018894439563155174, - "B": 0.9787561893463135, - "C": 0.0005413347389549017, - "D": 0.0008925101137720048 + "A": 0.0003646930563263595, + "B": 0.9012662768363953, + "C": 0.0002077958342852071, + "D": 0.00028402323368936777 }, "sample": { "messages": [ @@ -79072,10 +79072,10 @@ ] }, "predict": { - "A": 0.0012978236190974712, - "B": 0.008462873287498951, - "C": 0.978175163269043, - "D": 0.0016664386494085193 + "A": 0.0003120095352642238, + "B": 0.0006605241796933115, + "C": 0.9900727272033691, + "D": 0.0002931058406829834 }, "sample": { "messages": [ @@ -79117,10 +79117,10 @@ ] }, "predict": { - "A": 0.0008963205618783832, - "B": 0.9829348921775818, - "C": 0.00012912723468616605, - "D": 8.33707963465713e-05 + "A": 0.0022503903601318598, + "B": 0.9078722596168518, + "C": 0.00016301772848237306, + "D": 0.00011926632578251883 }, "sample": { "messages": [ @@ -79162,10 +79162,10 @@ ] }, "predict": { - "A": 0.5548157095909119, - "B": 0.1402793824672699, - "C": 0.231281578540802, - "D": 0.040190715342760086 + "A": 0.6091354489326477, + "B": 0.11994587630033493, + "C": 0.17452023923397064, + "D": 0.01839429698884487 }, "sample": { "messages": [ @@ -79207,10 +79207,10 @@ ] }, "predict": { - "A": 0.20667153596878052, - "B": 0.30070528388023376, - "C": 0.340743750333786, - "D": 0.1253526359796524 + "A": 0.3377998471260071, + "B": 0.10966755449771881, + "C": 0.43374359607696533, + "D": 0.0753733292222023 }, "sample": { "messages": [ @@ -79248,14 +79248,14 @@ "acc": false, "f1_macro": [ "B", - "A" + "C" ] }, "predict": { - "A": 0.3677016496658325, - "B": 0.3244955539703369, - "C": 0.1532808542251587, - "D": 0.07240474224090576 + "A": 0.11659527570009232, + "B": 0.2178286612033844, + "C": 0.46114325523376465, + "D": 0.09080448746681213 }, "sample": { "messages": [ @@ -79285,7 +79285,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -79297,10 +79297,10 @@ ] }, "predict": { - "A": 0.005836331285536289, - "B": 0.003123963251709938, - "C": 0.9815199971199036, - "D": 0.00047907530097290874 + "A": 0.004023449961096048, + "B": 0.0003102551563642919, + "C": 0.9845057129859924, + "D": 6.922728061908856e-05 }, "sample": { "messages": [ @@ -79342,10 +79342,10 @@ ] }, "predict": { - "A": 0.005080610979348421, - "B": 0.0021179139148443937, - "C": 0.9681931138038635, - "D": 0.0018690524157136679 + "A": 0.0005375968175940216, + "B": 0.00041868083644658327, + "C": 0.971997857093811, + "D": 0.0001977708307094872 }, "sample": { "messages": [ @@ -79387,10 +79387,10 @@ ] }, "predict": { - "A": 0.00037498073652386665, - "B": 0.0011550219496712089, - "C": 0.00020071271865162998, - "D": 0.9864566326141357 + "A": 0.0003396649262867868, + "B": 0.0003396649262867868, + "C": 0.00017079425742849708, + "D": 0.9511809945106506 }, "sample": { "messages": [ @@ -79425,17 +79425,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.30653589963912964, - "B": 0.5726844668388367, - "C": 0.0532679483294487, - "D": 0.010489081963896751 + "A": 0.5599784255027771, + "B": 0.33964410424232483, + "C": 0.009051208384335041, + "D": 0.001477569225244224 }, "sample": { "messages": [ @@ -79465,7 +79465,7 @@ "prompt_len": 72, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -79477,10 +79477,10 @@ ] }, "predict": { - "A": 0.007445001509040594, - "B": 0.9751028418540955, - "C": 0.00019840258755721152, - "D": 0.0005393141182139516 + "A": 0.014308599755167961, + "B": 0.885241687297821, + "C": 0.0001237936521647498, + "D": 0.0003365060256328434 }, "sample": { "messages": [ @@ -79522,10 +79522,10 @@ ] }, "predict": { - "A": 0.8348587155342102, - "B": 0.009274443611502647, - "C": 0.06852937489748001, - "D": 0.05337073281407356 + "A": 0.8206594586372375, + "B": 0.0016864205244928598, + "C": 0.06736382842063904, + "D": 0.010330578312277794 }, "sample": { "messages": [ @@ -79567,10 +79567,10 @@ ] }, "predict": { - "A": 0.0012186879757791758, - "B": 0.8106001019477844, - "C": 0.09681238234043121, - "D": 0.05871967971324921 + "A": 0.000732649932615459, + "B": 0.8034482002258301, + "C": 0.08468281477689743, + "D": 0.02426203340291977 }, "sample": { "messages": [ @@ -79612,10 +79612,10 @@ ] }, "predict": { - "A": 0.015416169539093971, - "B": 0.019794752821326256, - "C": 0.9537646174430847, - "D": 0.0002823570102918893 + "A": 0.00748433405533433, + "B": 0.0024298077914863825, + "C": 0.9802543520927429, + "D": 0.0001002901335596107 }, "sample": { "messages": [ @@ -79657,10 +79657,10 @@ ] }, "predict": { - "A": 0.00035128300078213215, - "B": 0.00029122387059032917, - "C": 6.10438291914761e-05, - "D": 0.9837157130241394 + "A": 0.0006344031426124275, + "B": 0.00015068394714035094, + "C": 5.207497815717943e-05, + "D": 0.9509194493293762 }, "sample": { "messages": [ @@ -79702,10 +79702,10 @@ ] }, "predict": { - "A": 0.03188860043883324, - "B": 0.011731160804629326, - "C": 0.9319215416908264, - "D": 0.01934141479432583 + "A": 0.020108258351683617, + "B": 0.0010011312551796436, + "C": 0.968870222568512, + "D": 0.006528195925056934 }, "sample": { "messages": [ @@ -79747,10 +79747,10 @@ ] }, "predict": { - "A": 0.0014713644050061703, - "B": 0.004532122053205967, - "C": 0.0004215529770590365, - "D": 0.9786657691001892 + "A": 0.0006451695808209479, + "B": 0.0018668599659577012, + "C": 0.0002862922556232661, + "D": 0.9670574069023132 }, "sample": { "messages": [ @@ -79792,10 +79792,10 @@ ] }, "predict": { - "A": 0.0005406595300883055, - "B": 0.9775353074073792, - "C": 3.0501923902193084e-05, - "D": 0.0004771303792949766 + "A": 0.0003542322665452957, + "B": 0.9318739175796509, + "C": 2.2645310309599154e-05, + "D": 0.0003542322665452957 }, "sample": { "messages": [ @@ -79837,10 +79837,10 @@ ] }, "predict": { - "A": 0.6532642841339111, - "B": 0.308580219745636, - "C": 0.0004639315593522042, - "D": 0.0004639315593522042 + "A": 0.9018716812133789, + "B": 0.011352889239788055, + "C": 0.00046858968562446535, + "D": 0.0002669943787623197 }, "sample": { "messages": [ @@ -79882,10 +79882,10 @@ ] }, "predict": { - "A": 0.001484856940805912, - "B": 0.0027740781661123037, - "C": 0.9876402020454407, - "D": 0.003143442329019308 + "A": 0.0005419210065156221, + "B": 0.0027521022129803896, + "C": 0.979816198348999, + "D": 0.00024047601618804038 }, "sample": { "messages": [ @@ -79927,10 +79927,10 @@ ] }, "predict": { - "A": 0.0021302038803696632, - "B": 0.0010062369983643293, - "C": 0.0006496754358522594, - "D": 0.9738113880157471 + "A": 0.0011057296069338918, + "B": 0.00014964422734919935, + "C": 0.00019214699568692595, + "D": 0.9443580508232117 }, "sample": { "messages": [ @@ -79972,10 +79972,10 @@ ] }, "predict": { - "A": 0.9482584595680237, - "B": 0.005638550501316786, - "C": 0.0005942988209426403, - "D": 0.0018305694684386253 + "A": 0.8338094353675842, + "B": 0.0006709939916618168, + "C": 0.0002178398281103, + "D": 0.00043322626152075827 }, "sample": { "messages": [ @@ -80017,10 +80017,10 @@ ] }, "predict": { - "A": 0.002770679770037532, - "B": 0.0011549910996109247, - "C": 0.986430287361145, - "D": 0.001904258388094604 + "A": 0.000792411738075316, + "B": 0.00017681095050647855, + "C": 0.9846889972686768, + "D": 0.0002003530680667609 }, "sample": { "messages": [ @@ -80062,10 +80062,10 @@ ] }, "predict": { - "A": 0.4381568729877472, - "B": 0.38667207956314087, - "C": 0.1107834056019783, - "D": 0.013231190852820873 + "A": 0.4574756324291229, + "B": 0.2160961925983429, + "C": 0.19070421159267426, + "D": 0.005758768413215876 }, "sample": { "messages": [ @@ -80107,10 +80107,10 @@ ] }, "predict": { - "A": 0.0865384191274643, - "B": 0.821053683757782, - "C": 0.01171170175075531, - "D": 0.046320684254169464 + "A": 0.03765247389674187, + "B": 0.756270170211792, + "C": 0.06207843869924545, + "D": 0.06207843869924545 }, "sample": { "messages": [ @@ -80152,10 +80152,10 @@ ] }, "predict": { - "A": 0.6465474367141724, - "B": 0.0601382777094841, - "C": 0.23785154521465302, - "D": 0.011841930449008942 + "A": 0.8464817404747009, + "B": 0.007323502097278833, + "C": 0.042143844068050385, + "D": 0.0011230953969061375 }, "sample": { "messages": [ @@ -80197,10 +80197,10 @@ ] }, "predict": { - "A": 0.028638102114200592, - "B": 0.73858642578125, - "C": 0.017369886860251427, - "D": 0.18674388527870178 + "A": 0.028799107298254967, + "B": 0.5104763507843018, + "C": 0.03697878494858742, + "D": 0.39755940437316895 }, "sample": { "messages": [ @@ -80242,10 +80242,10 @@ ] }, "predict": { - "A": 0.003482550149783492, - "B": 0.9656146168708801, - "C": 0.0003239276120439172, - "D": 0.017685849219560623 + "A": 0.0049199857749044895, + "B": 0.9375833868980408, + "C": 0.0006255060434341431, + "D": 0.009191744029521942 }, "sample": { "messages": [ @@ -80287,10 +80287,10 @@ ] }, "predict": { - "A": 0.003571487730368972, - "B": 0.00048334835446439683, - "C": 0.9902745485305786, - "D": 0.00013848161324858665 + "A": 0.0013186904834583402, + "B": 3.9820995880290866e-05, + "C": 0.9939025640487671, + "D": 2.7368549126549624e-05 }, "sample": { "messages": [ @@ -80332,10 +80332,10 @@ ] }, "predict": { - "A": 0.0007885099621489644, - "B": 0.0002559916756581515, - "C": 0.00017594033852219582, - "D": 0.9798404574394226 + "A": 0.0005725982482545078, + "B": 0.00012776395305991173, + "C": 0.00012002313451375812, + "D": 0.9136331677436829 }, "sample": { "messages": [ @@ -80377,10 +80377,10 @@ ] }, "predict": { - "A": 0.11886441707611084, - "B": 0.08169423043727875, - "C": 0.7750933170318604, - "D": 0.002795423148199916 + "A": 0.09095633029937744, + "B": 0.026059426367282867, + "C": 0.862969696521759, + "D": 0.0012974223354831338 }, "sample": { "messages": [ @@ -80422,10 +80422,10 @@ ] }, "predict": { - "A": 0.49344775080680847, - "B": 0.03154505789279938, - "C": 0.2641235589981079, - "D": 0.14137515425682068 + "A": 0.3822895884513855, + "B": 0.00957043282687664, + "C": 0.3822895884513855, + "D": 0.06643196195363998 }, "sample": { "messages": [ @@ -80455,7 +80455,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -80467,10 +80467,10 @@ ] }, "predict": { - "A": 0.0018785239662975073, - "B": 0.010810159146785736, - "C": 0.9730994701385498, - "D": 0.00041915534529834986 + "A": 0.0007455019513145089, + "B": 0.0013083966914564371, + "C": 0.9861440658569336, + "D": 0.00015626569802407175 }, "sample": { "messages": [ @@ -80512,10 +80512,10 @@ ] }, "predict": { - "A": 0.1292833834886551, - "B": 0.04756070300936699, - "C": 0.7439745664596558, - "D": 0.0326879620552063 + "A": 0.11717027425765991, + "B": 0.1327713280916214, + "C": 0.5950397849082947, + "D": 0.04884384199976921 }, "sample": { "messages": [ @@ -80557,10 +80557,10 @@ ] }, "predict": { - "A": 0.32459017634391785, - "B": 0.36780887842178345, - "C": 0.2527911067008972, - "D": 0.005945076700299978 + "A": 0.002808365738019347, + "B": 0.9392697811126709, + "C": 0.0016001587500795722, + "D": 0.0007558614015579224 }, "sample": { "messages": [ @@ -80602,10 +80602,10 @@ ] }, "predict": { - "A": 0.024742674082517624, - "B": 0.03177022188901901, - "C": 0.928462028503418, - "D": 0.0010871172416955233 + "A": 0.0065781897865235806, + "B": 0.005123099312186241, + "C": 0.9762899279594421, + "D": 0.00032750878017395735 }, "sample": { "messages": [ @@ -80647,10 +80647,10 @@ ] }, "predict": { - "A": 0.0010028210235759616, - "B": 0.0006892281235195696, - "C": 0.00019746717589441687, - "D": 0.9705054759979248 + "A": 0.0002446397265885025, + "B": 0.00011555962555576116, + "C": 8.454522321699187e-05, + "D": 0.8796563744544983 }, "sample": { "messages": [ @@ -80692,10 +80692,10 @@ ] }, "predict": { - "A": 0.004489882383495569, - "B": 0.969544529914856, - "C": 0.00025330178323201835, - "D": 0.0001536353083793074 + "A": 0.0010876883752644062, + "B": 0.8726674914360046, + "C": 0.00035312067484483123, + "D": 0.00025834862026385963 }, "sample": { "messages": [ @@ -80737,10 +80737,10 @@ ] }, "predict": { - "A": 0.008137558586895466, - "B": 0.03647000715136528, - "C": 0.9405738711357117, - "D": 0.0011012987233698368 + "A": 0.0027409950271248817, + "B": 0.005120852496474981, + "C": 0.9758617281913757, + "D": 0.00015463633462786674 }, "sample": { "messages": [ @@ -80782,10 +80782,10 @@ ] }, "predict": { - "A": 0.03669624403119087, - "B": 0.09975072741508484, - "C": 0.8352026343345642, - "D": 0.010513649322092533 + "A": 0.01216031238436699, + "B": 0.00650894595310092, + "C": 0.9660132527351379, + "D": 0.0008808900602161884 }, "sample": { "messages": [ @@ -80827,10 +80827,10 @@ ] }, "predict": { - "A": 0.017451398074626923, - "B": 0.004412404727190733, - "C": 0.000766760902479291, - "D": 0.9528140425682068 + "A": 0.004892031662166119, + "B": 0.0012368992902338505, + "C": 0.00024355988716706634, + "D": 0.9322561621665955 }, "sample": { "messages": [ @@ -80865,17 +80865,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "D" ] }, "predict": { - "A": 0.02514028362929821, - "B": 0.5721902251243591, - "C": 0.02514028362929821, - "D": 0.3470509350299835 + "A": 0.007679506205022335, + "B": 0.154246985912323, + "C": 0.014347205869853497, + "D": 0.7833308577537537 }, "sample": { "messages": [ @@ -80905,22 +80905,22 @@ "prompt_len": 73, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " D" } } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.2406083643436432, - "B": 0.2406083643436432, - "C": 0.44951558113098145, - "D": 0.032562799751758575 + "A": 0.34543362259864807, + "B": 0.20951607823371887, + "C": 0.30484408140182495, + "D": 0.02502312883734703 }, "sample": { "messages": [ @@ -80950,22 +80950,22 @@ "prompt_len": 65, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "C" ] }, "predict": { - "A": 0.06623877584934235, - "B": 0.021504582837224007, - "C": 0.1800556629896164, - "D": 0.7121340036392212 + "A": 0.03120146133005619, + "B": 0.004784898832440376, + "C": 0.7101419568061829, + "D": 0.23054933547973633 }, "sample": { "messages": [ @@ -80995,7 +80995,7 @@ "prompt_len": 73, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -81007,10 +81007,10 @@ ] }, "predict": { - "A": 0.00012937434075865895, - "B": 0.00021330220624804497, - "C": 1.8638140318216756e-05, - "D": 0.9848159551620483 + "A": 6.40939106233418e-05, + "B": 3.887492130161263e-05, + "C": 1.3434814718493726e-05, + "D": 0.9702891707420349 }, "sample": { "messages": [ @@ -81052,10 +81052,10 @@ ] }, "predict": { - "A": 0.0024125755298882723, - "B": 0.0030978082213550806, - "C": 0.0012131191324442625, - "D": 0.9733023643493652 + "A": 0.0019364829640835524, + "B": 0.00038131611654534936, + "C": 0.0011033747578039765, + "D": 0.9423468708992004 }, "sample": { "messages": [ @@ -81097,10 +81097,10 @@ ] }, "predict": { - "A": 0.465762197971344, - "B": 0.10392559319734573, - "C": 0.08093732595443726, - "D": 0.19415856897830963 + "A": 0.41521504521369934, + "B": 0.03408292680978775, + "C": 0.28537285327911377, + "D": 0.10498280823230743 }, "sample": { "messages": [ @@ -81138,14 +81138,14 @@ "acc": false, "f1_macro": [ "D", - "A" + "C" ] }, "predict": { - "A": 0.28333115577697754, - "B": 0.13383617997169495, - "C": 0.28333115577697754, - "D": 0.25003886222839355 + "A": 0.31033939123153687, + "B": 0.028866061940789223, + "C": 0.3516606092453003, + "D": 0.2132929414510727 }, "sample": { "messages": [ @@ -81187,10 +81187,10 @@ ] }, "predict": { - "A": 0.06519010663032532, - "B": 0.009997227229177952, - "C": 0.0047223553992807865, - "D": 0.8999216556549072 + "A": 0.07228720933198929, + "B": 0.0013239863328635693, + "C": 0.001409376272931695, + "D": 0.8806384801864624 }, "sample": { "messages": [ @@ -81225,17 +81225,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.25765082240104675, - "B": 0.42479437589645386, - "C": 0.17708063125610352, - "D": 0.04477299377322197 + "A": 0.22070670127868652, + "B": 0.04345972463488579, + "C": 0.6798244714736938, + "D": 0.010322589427232742 }, "sample": { "messages": [ @@ -81265,7 +81265,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -81277,10 +81277,10 @@ ] }, "predict": { - "A": 0.9428794384002686, - "B": 0.002648354507982731, - "C": 0.004366398323327303, - "D": 0.0001693036174401641 + "A": 0.9011725783348083, + "B": 0.0004132083849981427, + "C": 0.0028682348784059286, + "D": 7.643586286576465e-05 }, "sample": { "messages": [ @@ -81322,10 +81322,10 @@ ] }, "predict": { - "A": 0.004993796348571777, - "B": 0.02536058984696865, - "C": 0.00031924271024763584, - "D": 0.9516491293907166 + "A": 0.001528420252725482, + "B": 0.004422630649060011, + "C": 0.0002827293355949223, + "D": 0.9550222754478455 }, "sample": { "messages": [ @@ -81367,10 +81367,10 @@ ] }, "predict": { - "A": 0.007297853473573923, - "B": 0.0017333922442048788, - "C": 0.006440333556383848, - "D": 0.9558302164077759 + "A": 0.00170992873609066, + "B": 0.00026222606538794935, + "C": 0.0010371241951361299, + "D": 0.9428919553756714 }, "sample": { "messages": [ @@ -81412,10 +81412,10 @@ ] }, "predict": { - "A": 0.004034444689750671, - "B": 0.0015799113316461444, - "C": 0.9871960878372192, - "D": 0.0002745474048424512 + "A": 0.0016903068171814084, + "B": 0.0002759349881671369, + "C": 0.9921854138374329, + "D": 6.156941526569426e-05 }, "sample": { "messages": [ @@ -81457,10 +81457,10 @@ ] }, "predict": { - "A": 0.1130104511976242, - "B": 0.015294302254915237, - "C": 0.0023454572074115276, - "D": 0.8350405693054199 + "A": 0.1376759111881256, + "B": 0.0028573726303875446, + "C": 0.0014367771800607443, + "D": 0.7922702431678772 }, "sample": { "messages": [ @@ -81502,10 +81502,10 @@ ] }, "predict": { - "A": 0.006585539318621159, - "B": 0.004526170901954174, - "C": 0.977380633354187, - "D": 0.000612550531513989 + "A": 0.002756137400865555, + "B": 0.0003504035121295601, + "C": 0.9812528491020203, + "D": 9.430983482161537e-05 }, "sample": { "messages": [ @@ -81547,10 +81547,10 @@ ] }, "predict": { - "A": 0.8436739444732666, - "B": 0.06111554056406021, - "C": 0.04759683087468147, - "D": 0.013636719435453415 + "A": 0.9036835432052612, + "B": 0.010039018467068672, + "C": 0.007818394340574741, + "D": 0.0028762267902493477 }, "sample": { "messages": [ @@ -81592,10 +81592,10 @@ ] }, "predict": { - "A": 0.9324733018875122, - "B": 0.0033630237448960543, - "C": 0.002311370335519314, - "D": 0.0006622186629101634 + "A": 0.8329300880432129, + "B": 0.001105115283280611, + "C": 0.001105115283280611, + "D": 0.00013198719534557313 }, "sample": { "messages": [ @@ -81637,10 +81637,10 @@ ] }, "predict": { - "A": 0.9425501823425293, - "B": 0.004946049302816391, - "C": 0.0012505571357905865, - "D": 0.002336348406970501 + "A": 0.9015952348709106, + "B": 0.0011962188873440027, + "C": 0.0005308189429342747, + "D": 0.0008221484604291618 }, "sample": { "messages": [ @@ -81682,10 +81682,10 @@ ] }, "predict": { - "A": 0.0004792870895471424, - "B": 0.0002565438626334071, - "C": 0.0002907023299485445, - "D": 0.9819539785385132 + "A": 0.0002731737622525543, + "B": 6.0953301726840436e-05, + "C": 0.00024107497301883996, + "D": 0.922744870185852 }, "sample": { "messages": [ @@ -81727,10 +81727,10 @@ ] }, "predict": { - "A": 0.9453572034835815, - "B": 0.009267956018447876, - "C": 0.013484795577824116, - "D": 0.0007607601583003998 + "A": 0.9002389907836914, + "B": 0.0019692648202180862, + "C": 0.0015336651122197509, + "D": 0.0003019965370185673 }, "sample": { "messages": [ @@ -81772,10 +81772,10 @@ ] }, "predict": { - "A": 0.0010960515355691314, - "B": 0.6433662176132202, - "C": 0.0004032147699035704, - "D": 0.34436914324760437 + "A": 0.0002771519066300243, + "B": 0.6849257946014404, + "C": 0.00013091728033032268, + "D": 0.285519540309906 }, "sample": { "messages": [ @@ -81817,10 +81817,10 @@ ] }, "predict": { - "A": 0.005766516551375389, - "B": 0.9697790145874023, - "C": 0.0021213830914348364, - "D": 0.003963265102356672 + "A": 0.008195810951292515, + "B": 0.9473069906234741, + "C": 0.0015160726616159081, + "D": 0.008195810951292515 }, "sample": { "messages": [ @@ -81862,10 +81862,10 @@ ] }, "predict": { - "A": 0.002447977429255843, - "B": 0.0027739217039197683, - "C": 0.9875845313072205, - "D": 0.0003312976914457977 + "A": 0.0009541634353809059, + "B": 0.0004797835717909038, + "C": 0.9829711318016052, + "D": 0.00010056808969238773 }, "sample": { "messages": [ @@ -81907,10 +81907,10 @@ ] }, "predict": { - "A": 0.0029015943873673677, - "B": 0.0010027640964835882, - "C": 0.9116538166999817, - "D": 0.07483310252428055 + "A": 0.0017133476212620735, + "B": 0.00021782770636491477, + "C": 0.944777250289917, + "D": 0.04151061549782753 }, "sample": { "messages": [ @@ -81952,10 +81952,10 @@ ] }, "predict": { - "A": 0.9714441895484924, - "B": 0.010791771113872528, - "C": 0.00011988573533017188, - "D": 0.00022397603606805205 + "A": 0.9295355081558228, + "B": 0.002610874129459262, + "C": 5.090393824502826e-05, + "D": 5.768171467934735e-05 }, "sample": { "messages": [ @@ -81997,10 +81997,10 @@ ] }, "predict": { - "A": 0.0027765342965722084, - "B": 0.0013115417677909136, - "C": 0.9885146021842957, - "D": 0.00025825787452049553 + "A": 0.0016881815390661359, + "B": 0.000201624512556009, + "C": 0.9909378290176392, + "D": 5.776638863608241e-05 }, "sample": { "messages": [ @@ -82042,10 +82042,10 @@ ] }, "predict": { - "A": 0.0009035420371219516, - "B": 0.0011601708829402924, - "C": 0.9908541440963745, - "D": 0.0002016074868151918 + "A": 0.0007018090691417456, + "B": 0.00025818112771958113, + "C": 0.9882208108901978, + "D": 8.381914085475728e-05 }, "sample": { "messages": [ @@ -82087,10 +82087,10 @@ ] }, "predict": { - "A": 0.944507360458374, - "B": 0.0038599856197834015, - "C": 0.0004068395064678043, - "D": 0.001253153895959258 + "A": 0.8793414831161499, + "B": 0.0004292024241294712, + "C": 0.0001578947267262265, + "D": 0.0003787697642110288 }, "sample": { "messages": [ @@ -82132,10 +82132,10 @@ ] }, "predict": { - "A": 0.09340605139732361, - "B": 0.78207927942276, - "C": 0.0727446973323822, - "D": 0.018392741680145264 + "A": 0.18171072006225586, + "B": 0.43590137362480164, + "C": 0.2643875479698181, + "D": 0.024591874331235886 }, "sample": { "messages": [ @@ -82177,10 +82177,10 @@ ] }, "predict": { - "A": 0.9346196055412292, - "B": 0.009162688627839088, - "C": 0.0015922365710139275, - "D": 0.013331632129848003 + "A": 0.8906905055046082, + "B": 0.0005582195008173585, + "C": 0.00028069043764844537, + "D": 0.0011101521085947752 }, "sample": { "messages": [ @@ -82222,10 +82222,10 @@ ] }, "predict": { - "A": 0.15616591274738312, - "B": 0.6998870372772217, - "C": 0.10733115673065186, - "D": 0.011312619782984257 + "A": 0.1351243257522583, + "B": 0.6862179636955261, + "C": 0.11924679577350616, + "D": 0.005239338614046574 }, "sample": { "messages": [ @@ -82267,10 +82267,10 @@ ] }, "predict": { - "A": 0.003123731818050146, - "B": 0.0013021650956943631, - "C": 0.00054282316705212, - "D": 0.9814473390579224 + "A": 0.0022095812018960714, + "B": 0.00038396765012294054, + "C": 0.00029903434915468097, + "D": 0.9488996863365173 }, "sample": { "messages": [ @@ -82312,10 +82312,10 @@ ] }, "predict": { - "A": 0.8778819441795349, - "B": 0.08165574073791504, - "C": 0.0006234492757357657, - "D": 0.002794106025248766 + "A": 0.8931171298027039, + "B": 0.021004103124141693, + "C": 0.00018172108684666455, + "D": 0.0007187208393588662 }, "sample": { "messages": [ @@ -82357,10 +82357,10 @@ ] }, "predict": { - "A": 0.006539646070450544, - "B": 0.008397071622312069, - "C": 0.002405801322311163, - "D": 0.9705694913864136 + "A": 0.0004747412749566138, + "B": 7.749937503831461e-05, + "C": 0.0003697288630064577, + "D": 0.9726406335830688 }, "sample": { "messages": [ @@ -82402,10 +82402,10 @@ ] }, "predict": { - "A": 0.036372579634189606, - "B": 0.01042091939598322, - "C": 0.9380612373352051, - "D": 0.0003565842634998262 + "A": 0.008439012803137302, + "B": 0.0010729000205174088, + "C": 0.9754172563552856, + "D": 7.301177538465708e-05 }, "sample": { "messages": [ @@ -82447,10 +82447,10 @@ ] }, "predict": { - "A": 0.0016031539998948574, - "B": 0.0019337725825607777, - "C": 0.9410279393196106, - "D": 0.0468510203063488 + "A": 0.0010747608030214906, + "B": 0.0005076810484752059, + "C": 0.977108895778656, + "D": 0.013937709853053093 }, "sample": { "messages": [ @@ -82492,10 +82492,10 @@ ] }, "predict": { - "A": 0.0002571185468696058, - "B": 0.984153687953949, - "C": 0.00020024414698127657, - "D": 0.0016766238259151578 + "A": 0.00027662317734211683, + "B": 0.934396505355835, + "C": 0.00015761512622702867, + "D": 0.0014048082521185279 }, "sample": { "messages": [ @@ -82537,10 +82537,10 @@ ] }, "predict": { - "A": 0.07298669964075089, - "B": 0.8891600370407104, - "C": 0.006788820493966341, - "D": 0.011192873120307922 + "A": 0.2025742083787918, + "B": 0.7070534229278564, + "C": 0.011428454890847206, + "D": 0.011428454890847206 }, "sample": { "messages": [ @@ -82582,10 +82582,10 @@ ] }, "predict": { - "A": 0.4853891432285309, - "B": 0.03984316810965538, - "C": 0.20234030485153198, - "D": 0.22928160429000854 + "A": 0.8354182243347168, + "B": 0.0038687540218234062, + "C": 0.025227444246411324, + "D": 0.041593022644519806 }, "sample": { "messages": [ @@ -82627,10 +82627,10 @@ ] }, "predict": { - "A": 0.9235695004463196, - "B": 0.02788938209414482, - "C": 0.007051539607346058, - "D": 0.013174010440707207 + "A": 0.7719448208808899, + "B": 0.007567881140857935, + "C": 0.018154403194785118, + "D": 0.15200494229793549 }, "sample": { "messages": [ @@ -82672,10 +82672,10 @@ ] }, "predict": { - "A": 0.6029137969017029, - "B": 0.026490183547139168, - "C": 0.2847963273525238, - "D": 0.038542989641427994 + "A": 0.702656626701355, + "B": 0.012869605794548988, + "C": 0.12210340797901154, + "D": 0.0165249016135931 }, "sample": { "messages": [ @@ -82717,10 +82717,10 @@ ] }, "predict": { - "A": 0.9558030962944031, - "B": 0.009370364248752594, - "C": 0.0012681408552452922, - "D": 0.0023691991809755564 + "A": 0.9122575521469116, + "B": 0.0010681437561288476, + "C": 0.00039294816087931395, + "D": 0.0007341238087974489 }, "sample": { "messages": [ @@ -82762,10 +82762,10 @@ ] }, "predict": { - "A": 0.0012976984726265073, - "B": 0.9780808091163635, - "C": 0.00037179686478339136, - "D": 0.001666277996264398 + "A": 0.001313333516009152, + "B": 0.9298921227455139, + "C": 0.0004005440860055387, + "D": 0.00108879164326936 }, "sample": { "messages": [ @@ -82807,10 +82807,10 @@ ] }, "predict": { - "A": 0.9512487649917603, - "B": 0.015375504270195961, - "C": 0.0011137977708131075, - "D": 0.0011137977708131075 + "A": 0.9439142942428589, + "B": 0.0014191176742315292, + "C": 0.0003588091640267521, + "D": 0.00029746326617896557 }, "sample": { "messages": [ @@ -82852,10 +82852,10 @@ ] }, "predict": { - "A": 0.0008973248186521232, - "B": 0.9840361475944519, - "C": 0.0003301073447801173, - "D": 0.0003301073447801173 + "A": 0.0013481377391144633, + "B": 0.9545348882675171, + "C": 0.0003202107618562877, + "D": 0.00026546407025307417 }, "sample": { "messages": [ @@ -82897,10 +82897,10 @@ ] }, "predict": { - "A": 0.9279545545578003, - "B": 0.004869458731263876, - "C": 0.0015808818861842155, - "D": 0.0037923383060842752 + "A": 0.7930306196212769, + "B": 0.0017347474349662662, + "C": 0.002524042734876275, + "D": 0.0013510227436199784 }, "sample": { "messages": [ @@ -82942,10 +82942,10 @@ ] }, "predict": { - "A": 0.002772378269582987, - "B": 0.00042515795212239027, - "C": 0.0002578713174443692, - "D": 0.9870349764823914 + "A": 0.0014655219856649637, + "B": 8.801143849268556e-05, + "C": 0.00041987907025031745, + "D": 0.9747796654701233 }, "sample": { "messages": [ @@ -82987,10 +82987,10 @@ ] }, "predict": { - "A": 0.07838255167007446, - "B": 0.7436729669570923, - "C": 0.010607924312353134, - "D": 0.1464378833770752 + "A": 0.051740773022174835, + "B": 0.4909028112888336, + "C": 0.007002352271229029, + "D": 0.4332202076911926 }, "sample": { "messages": [ @@ -83032,10 +83032,10 @@ ] }, "predict": { - "A": 0.08234778791666031, - "B": 0.0343276672065258, - "C": 0.014309900812804699, - "D": 0.7812941074371338 + "A": 0.06038431450724602, + "B": 0.01189037598669529, + "C": 0.008172128349542618, + "D": 0.8335797190666199 }, "sample": { "messages": [ @@ -83077,10 +83077,10 @@ ] }, "predict": { - "A": 0.11094280332326889, - "B": 0.2661379873752594, - "C": 0.5634140968322754, - "D": 0.011693285778164864 + "A": 0.11440029740333557, + "B": 0.24218542873859406, + "C": 0.5127065181732178, + "D": 0.01064088474959135 }, "sample": { "messages": [ @@ -83122,10 +83122,10 @@ ] }, "predict": { - "A": 0.00010794768604682758, - "B": 0.0003767744346987456, - "C": 0.0002589530195109546, - "D": 0.9911752939224243 + "A": 4.626706140697934e-05, + "B": 8.643825276521966e-05, + "C": 0.00019479160255286843, + "D": 0.9573556780815125 }, "sample": { "messages": [ @@ -83167,10 +83167,10 @@ ] }, "predict": { - "A": 0.691027045249939, - "B": 0.06427552551031113, - "C": 0.023645645007491112, - "D": 0.1979825645685196 + "A": 0.6740351319313049, + "B": 0.002145306207239628, + "C": 0.004266450647264719, + "D": 0.28097963333129883 }, "sample": { "messages": [ @@ -83212,10 +83212,10 @@ ] }, "predict": { - "A": 0.5276145339012146, - "B": 0.11772672086954117, - "C": 0.010950290597975254, - "D": 0.3200143873691559 + "A": 0.764945924282074, + "B": 0.01587594486773014, + "C": 0.0035424025263637304, + "D": 0.17068250477313995 }, "sample": { "messages": [ @@ -83257,10 +83257,10 @@ ] }, "predict": { - "A": 0.04388846084475517, - "B": 0.04388846084475517, - "C": 0.005939657799899578, - "D": 0.8815233707427979 + "A": 0.06299162656068802, + "B": 0.01165227871388197, + "C": 0.006237015128135681, + "D": 0.8695725202560425 }, "sample": { "messages": [ @@ -83295,17 +83295,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "A" ] }, "predict": { - "A": 0.258562296628952, - "B": 0.7028451561927795, - "C": 0.004179270006716251, - "D": 0.0019741475116461515 + "A": 0.4507780075073242, + "B": 0.4507780075073242, + "C": 0.0038999938406050205, + "D": 0.0036637054290622473 }, "sample": { "messages": [ @@ -83335,7 +83335,7 @@ "prompt_len": 72, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -83347,10 +83347,10 @@ ] }, "predict": { - "A": 0.0010158853838220239, - "B": 0.9831489324569702, - "C": 0.00014635217667091638, - "D": 0.0005437642685137689 + "A": 0.0009946331847459078, + "B": 0.9625815153121948, + "C": 8.691008406458423e-05, + "D": 0.0006032754899933934 }, "sample": { "messages": [ @@ -83392,10 +83392,10 @@ ] }, "predict": { - "A": 0.0011398755013942719, - "B": 0.9735206961631775, - "C": 0.00022445479407906532, - "D": 0.00022445479407906532 + "A": 0.0007015661103650928, + "B": 0.9280260801315308, + "C": 0.00020100203983020037, + "D": 0.0002747372491285205 }, "sample": { "messages": [ @@ -83437,10 +83437,10 @@ ] }, "predict": { - "A": 0.9440970420837402, - "B": 0.028509261086583138, - "C": 0.0003167092800140381, - "D": 0.0005221653264015913 + "A": 0.9271606206893921, + "B": 0.004865292459726334, + "C": 0.0001469190901843831, + "D": 0.0001772182440618053 }, "sample": { "messages": [ @@ -83482,10 +83482,10 @@ ] }, "predict": { - "A": 0.08262868970632553, - "B": 0.0007148782606236637, - "C": 0.0007148782606236637, - "D": 0.8883421421051025 + "A": 0.021544069051742554, + "B": 6.85700579197146e-05, + "C": 0.0006111582042649388, + "D": 0.9160771369934082 }, "sample": { "messages": [ @@ -83527,10 +83527,10 @@ ] }, "predict": { - "A": 0.9604474902153015, - "B": 0.007333106826990843, - "C": 0.0007729037897661328, - "D": 0.0007729037897661328 + "A": 0.8340238928794861, + "B": 0.0005227049696259201, + "C": 0.0002469083992764354, + "D": 0.00016969747957773507 }, "sample": { "messages": [ @@ -83572,10 +83572,10 @@ ] }, "predict": { - "A": 0.0050726234912872314, - "B": 0.002396137686446309, - "C": 0.0016468398971483111, - "D": 0.9666709303855896 + "A": 0.0015685883117839694, + "B": 0.0002122853184118867, + "C": 0.0004494080494623631, + "D": 0.9207383990287781 }, "sample": { "messages": [ @@ -83617,10 +83617,10 @@ ] }, "predict": { - "A": 0.001024040044285357, - "B": 0.0004268834018148482, - "C": 0.9910407662391663, - "D": 0.00017795147141441703 + "A": 0.0007933237939141691, + "B": 6.511998071800917e-05, + "C": 0.9858223795890808, + "D": 3.9497263060184196e-05 }, "sample": { "messages": [ @@ -83662,10 +83662,10 @@ ] }, "predict": { - "A": 0.7722976803779602, - "B": 0.13420520722866058, - "C": 0.04937133938074112, - "D": 0.000904267595615238 + "A": 0.8931492567062378, + "B": 0.014436413533985615, + "C": 0.007727255113422871, + "D": 0.00019344803877174854 }, "sample": { "messages": [ @@ -83707,10 +83707,10 @@ ] }, "predict": { - "A": 0.00512306671589613, - "B": 0.0010087916161864996, - "C": 0.9762836694717407, - "D": 0.00957114901393652 + "A": 0.004007960669696331, + "B": 0.0001554054906591773, + "C": 0.9807156324386597, + "D": 0.002754628425464034 }, "sample": { "messages": [ @@ -83752,10 +83752,10 @@ ] }, "predict": { - "A": 0.0024104088079184294, - "B": 0.0010048078838735819, - "C": 0.002127178246155381, - "D": 0.9724283218383789 + "A": 0.0019601862877607346, + "B": 0.0001823257189244032, + "C": 0.0010492120636627078, + "D": 0.953881561756134 }, "sample": { "messages": [ @@ -83797,10 +83797,10 @@ ] }, "predict": { - "A": 0.06623020768165588, - "B": 0.10919515788555145, - "C": 0.0032974081113934517, - "D": 0.8068491220474243 + "A": 0.04586251080036163, + "B": 0.014889377169311047, + "C": 0.0010785828344523907, + "D": 0.9211731553077698 }, "sample": { "messages": [ @@ -83842,10 +83842,10 @@ ] }, "predict": { - "A": 0.970020592212677, - "B": 0.00019736851390916854, - "C": 1.261734632862499e-05, - "D": 2.671092443051748e-05 + "A": 0.8932009935379028, + "B": 1.5880104911047965e-05, + "C": 3.5433299672149587e-06, + "D": 6.218737780727679e-06 }, "sample": { "messages": [ @@ -83887,10 +83887,10 @@ ] }, "predict": { - "A": 0.6579965949058533, - "B": 0.02251540496945381, - "C": 0.003047129139304161, - "D": 0.27429378032684326 + "A": 0.49077504873275757, + "B": 0.007932639680802822, + "C": 0.004246036056429148, + "D": 0.4331074655056 }, "sample": { "messages": [ @@ -83932,10 +83932,10 @@ ] }, "predict": { - "A": 0.08277465403079987, - "B": 0.8899114727973938, - "C": 0.006794557441025972, - "D": 0.0022058701142668724 + "A": 0.08908472210168839, + "B": 0.8452123403549194, + "C": 0.022524144500494003, + "D": 0.0036769655998796225 }, "sample": { "messages": [ @@ -83977,10 +83977,10 @@ ] }, "predict": { - "A": 0.01773650571703911, - "B": 0.003492525313049555, - "C": 0.9683804512023926, - "D": 0.00019703478028532118 + "A": 0.019954925402998924, + "B": 0.0011257798178121448, + "C": 0.9614821672439575, + "D": 8.681082545081154e-05 }, "sample": { "messages": [ @@ -84022,10 +84022,10 @@ ] }, "predict": { - "A": 0.0007938138442113996, - "B": 0.00148303946480155, - "C": 0.00054557976545766, - "D": 0.9864313006401062 + "A": 0.00022363873722497374, + "B": 0.00017417002527508885, + "C": 9.322649566456676e-05, + "D": 0.9699811935424805 }, "sample": { "messages": [ @@ -84067,10 +84067,10 @@ ] }, "predict": { - "A": 0.00037789251655340195, - "B": 0.0003334889479447156, - "C": 0.9941165447235107, - "D": 0.000229203375056386 + "A": 0.00022897605958860368, + "B": 6.560274050571024e-05, + "C": 0.993130624294281, + "D": 5.438658263301477e-05 }, "sample": { "messages": [ @@ -84112,10 +84112,10 @@ ] }, "predict": { - "A": 0.5683793425559998, - "B": 0.03206576779484749, - "C": 0.2090950906276703, - "D": 0.14370879530906677 + "A": 0.7811118960380554, + "B": 0.003192225703969598, + "C": 0.09329050779342651, + "D": 0.0126254977658391 }, "sample": { "messages": [ @@ -84157,10 +84157,10 @@ ] }, "predict": { - "A": 0.005082548595964909, - "B": 0.9685623645782471, - "C": 0.0018697652267292142, - "D": 0.0012850695056840777 + "A": 0.004364494234323502, + "B": 0.9424682259559631, + "C": 0.0012504484038800001, + "D": 0.0011035167844966054 }, "sample": { "messages": [ @@ -84202,10 +84202,10 @@ ] }, "predict": { - "A": 0.8770446181297302, - "B": 0.04947947338223457, - "C": 0.005215097684413195, - "D": 0.02337244711816311 + "A": 0.9255416393280029, + "B": 0.0022941885981708765, + "C": 0.0006572959828190506, + "D": 0.0029457963537424803 }, "sample": { "messages": [ @@ -84247,10 +84247,10 @@ ] }, "predict": { - "A": 0.01568426564335823, - "B": 0.9703511595726013, - "C": 0.0014588638441637158, - "D": 0.0006473669782280922 + "A": 0.017320360988378525, + "B": 0.9456596374511719, + "C": 0.000810083991382271, + "D": 0.0004913408192805946 }, "sample": { "messages": [ @@ -84292,10 +84292,10 @@ ] }, "predict": { - "A": 0.008504599332809448, - "B": 0.0014778777258470654, - "C": 0.9829980134963989, - "D": 0.00047979666851460934 + "A": 0.006592774763703346, + "B": 0.00014565336459781975, + "C": 0.97845458984375, + "D": 7.796262798365206e-05 }, "sample": { "messages": [ @@ -84337,10 +84337,10 @@ ] }, "predict": { - "A": 0.42666274309158325, - "B": 0.5478458404541016, - "C": 0.0004995707422494888, - "D": 0.001057591289281845 + "A": 0.18150636553764343, + "B": 0.7178716063499451, + "C": 0.00024081909214146435, + "D": 0.0007896153838373721 }, "sample": { "messages": [ @@ -84382,10 +84382,10 @@ ] }, "predict": { - "A": 0.08444108814001083, - "B": 0.1577567160129547, - "C": 0.6239399313926697, - "D": 0.10842449218034744 + "A": 0.08716226369142532, + "B": 0.23693162202835083, + "C": 0.5015842318534851, + "D": 0.11191857606172562 }, "sample": { "messages": [ @@ -84420,17 +84420,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.5173373818397522, - "B": 0.08989975601434708, - "C": 0.013786574825644493, - "D": 0.35556045174598694 + "A": 0.296017050743103, + "B": 0.01473782118409872, + "C": 0.018923737108707428, + "D": 0.6266680955886841 }, "sample": { "messages": [ @@ -84460,7 +84460,7 @@ "prompt_len": 81, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -84472,10 +84472,10 @@ ] }, "predict": { - "A": 0.13937589526176453, - "B": 0.8020529747009277, - "C": 0.0032778072636574507, - "D": 0.007863051258027554 + "A": 0.22386077046394348, + "B": 0.6085165739059448, + "C": 0.004100152291357517, + "D": 0.005264699924737215 }, "sample": { "messages": [ @@ -84517,10 +84517,10 @@ ] }, "predict": { - "A": 0.7085031270980835, - "B": 0.024243641644716263, - "C": 0.20298954844474792, - "D": 0.0007320945733226836 + "A": 0.687836229801178, + "B": 0.003185313893482089, + "C": 0.1534770131111145, + "D": 0.0002783296222332865 }, "sample": { "messages": [ @@ -84562,10 +84562,10 @@ ] }, "predict": { - "A": 0.050312817096710205, - "B": 0.12069418281316757, - "C": 0.007715721148997545, - "D": 0.7870249152183533 + "A": 0.11454277485609055, + "B": 0.042137935757637024, + "C": 0.010654137469828129, + "D": 0.7469127178192139 }, "sample": { "messages": [ @@ -84607,10 +84607,10 @@ ] }, "predict": { - "A": 0.057652924209833145, - "B": 0.016517840325832367, - "C": 0.0015363985439762473, - "D": 0.9018434882164001 + "A": 0.11483365297317505, + "B": 0.006478470750153065, + "C": 0.0008236451540142298, + "D": 0.8485122919082642 }, "sample": { "messages": [ @@ -84652,10 +84652,10 @@ ] }, "predict": { - "A": 0.008557829074561596, - "B": 0.8729222416877747, - "C": 0.08119441568851471, - "D": 0.005881703458726406 + "A": 0.0153953330591321, + "B": 0.654626190662384, + "C": 0.21252600848674774, + "D": 0.0030315208714455366 }, "sample": { "messages": [ @@ -84697,10 +84697,10 @@ ] }, "predict": { - "A": 0.030529338866472244, - "B": 0.050334375351667404, - "C": 0.00468182610347867, - "D": 0.8921980857849121 + "A": 0.018472852185368538, + "B": 0.034511830657720566, + "C": 0.002661266829818487, + "D": 0.8900718688964844 }, "sample": { "messages": [ @@ -84738,14 +84738,14 @@ "acc": false, "f1_macro": [ "A", - "B" + "C" ] }, "predict": { - "A": 0.09377231448888779, - "B": 0.6928889155387878, - "C": 0.15460442006587982, - "D": 0.0209234319627285 + "A": 0.27957940101623535, + "B": 0.048583611845970154, + "C": 0.5918695330619812, + "D": 0.004245188552886248 }, "sample": { "messages": [ @@ -84775,7 +84775,7 @@ "prompt_len": 85, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -84787,10 +84787,10 @@ ] }, "predict": { - "A": 0.01567341573536396, - "B": 0.0044905091635882854, - "C": 0.9696799516677856, - "D": 0.0027236314490437508 + "A": 0.0008996224496513605, + "B": 8.367792179342359e-05, + "C": 0.986555814743042, + "D": 5.402652459451929e-05 }, "sample": { "messages": [ @@ -84825,17 +84825,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "A" + "C" ] }, "predict": { - "A": 0.5366496443748474, - "B": 0.007654896005988121, - "C": 0.41794317960739136, - "D": 0.0021931645460426807 + "A": 0.11437031626701355, + "B": 0.0013524822425097227, + "C": 0.8450886607170105, + "D": 0.0002663195482455194 }, "sample": { "messages": [ @@ -84865,7 +84865,7 @@ "prompt_len": 84, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -84877,10 +84877,10 @@ ] }, "predict": { - "A": 0.1149306371808052, - "B": 0.8492289781570435, - "C": 0.003932710736989975, - "D": 0.010690215043723583 + "A": 0.01906338892877102, + "B": 0.9185255169868469, + "C": 0.0005407867138274014, + "D": 0.011562529951334 }, "sample": { "messages": [ @@ -84922,10 +84922,10 @@ ] }, "predict": { - "A": 0.19587218761444092, - "B": 0.7746895551681519, - "C": 0.0031659791711717844, - "D": 0.003587524639442563 + "A": 0.2978675663471222, + "B": 0.6305856108665466, + "C": 0.0025770594365894794, + "D": 0.004248852375894785 }, "sample": { "messages": [ @@ -84967,10 +84967,10 @@ ] }, "predict": { - "A": 0.7023493647575378, - "B": 0.12205000966787338, - "C": 0.12205000966787338, - "D": 0.006076512858271599 + "A": 0.7410870790481567, + "B": 0.010571039281785488, + "C": 0.12878161668777466, + "D": 0.003028653562068939 }, "sample": { "messages": [ @@ -85012,10 +85012,10 @@ ] }, "predict": { - "A": 0.11663704365491867, - "B": 0.86183762550354, - "C": 0.0065802112221717834, - "D": 0.001885262201540172 + "A": 0.07396342605352402, + "B": 0.9010590314865112, + "C": 0.007323371712118387, + "D": 0.0011955074733123183 }, "sample": { "messages": [ @@ -85057,10 +85057,10 @@ ] }, "predict": { - "A": 0.0024471920914947987, - "B": 0.00215963926166296, - "C": 0.9872676730155945, - "D": 0.0002579321153461933 + "A": 0.0031232505571097136, + "B": 0.00044994690688326955, + "C": 0.9812961220741272, + "D": 6.482098979176953e-05 }, "sample": { "messages": [ @@ -85102,10 +85102,10 @@ ] }, "predict": { - "A": 0.7735618948936462, - "B": 0.06349782645702362, - "C": 0.0011630032677203417, - "D": 0.15232335031032562 + "A": 0.804811418056488, + "B": 0.016703328117728233, + "C": 0.00044512859312817454, + "D": 0.15847675502300262 }, "sample": { "messages": [ @@ -85147,10 +85147,10 @@ ] }, "predict": { - "A": 0.10104622691869736, - "B": 0.8460496664047241, - "C": 0.017559200525283813, - "D": 0.01367511972784996 + "A": 0.3809514343738556, + "B": 0.5542810559272766, + "C": 0.01477108895778656, + "D": 0.01015201210975647 }, "sample": { "messages": [ @@ -85192,10 +85192,10 @@ ] }, "predict": { - "A": 0.47409358620643616, - "B": 0.47409358620643616, - "C": 0.014316385611891747, - "D": 0.000758740643505007 + "A": 0.8823129534721375, + "B": 0.038766127079725266, + "C": 0.0036058113910257816, + "D": 0.00013981247320771217 }, "sample": { "messages": [ @@ -85237,10 +85237,10 @@ ] }, "predict": { - "A": 0.006621685344725847, - "B": 0.004551013465970755, - "C": 0.9827452301979065, - "D": 0.00022658160014543682 + "A": 0.001493697171099484, + "B": 0.0005162079469300807, + "C": 0.993520200252533, + "D": 6.562847556779161e-05 }, "sample": { "messages": [ @@ -85282,10 +85282,10 @@ ] }, "predict": { - "A": 0.0005413831095211208, - "B": 0.0021412118803709745, - "C": 0.0008925899164751172, - "D": 0.9788436889648438 + "A": 0.00019548485579434782, + "B": 0.00026719612651504576, + "C": 0.00028442879556678236, + "D": 0.9607628583908081 }, "sample": { "messages": [ @@ -85327,10 +85327,10 @@ ] }, "predict": { - "A": 0.049628764390945435, - "B": 0.049628764390945435, - "C": 0.7763245105743408, - "D": 0.10506409406661987 + "A": 0.05215924233198166, + "B": 0.024638280272483826, + "C": 0.6354296207427979, + "D": 0.23376153409481049 }, "sample": { "messages": [ @@ -85372,10 +85372,10 @@ ] }, "predict": { - "A": 0.8060257434844971, - "B": 0.0147628765553236, - "C": 0.1400662511587143, - "D": 0.0047928038984537125 + "A": 0.8869649171829224, + "B": 0.0021985662169754505, + "C": 0.014336451888084412, + "D": 0.0021985662169754505 }, "sample": { "messages": [ @@ -85417,10 +85417,10 @@ ] }, "predict": { - "A": 0.9652667045593262, - "B": 0.0073699019849300385, - "C": 0.0002857621293514967, - "D": 0.000196401248103939 + "A": 0.9232151508331299, + "B": 0.0005786035326309502, + "C": 5.728951146011241e-05, + "D": 3.264257975388318e-05 }, "sample": { "messages": [ @@ -85462,10 +85462,10 @@ ] }, "predict": { - "A": 0.08725152164697647, - "B": 0.05292072147130966, - "C": 0.09886892139911652, - "D": 0.7305480241775513 + "A": 0.05915853753685951, + "B": 0.009657365269958973, + "C": 0.06703539937734604, + "D": 0.8166583776473999 }, "sample": { "messages": [ @@ -85507,10 +85507,10 @@ ] }, "predict": { - "A": 0.07892319560050964, - "B": 0.7488024830818176, - "C": 0.01995491050183773, - "D": 0.10133939236402512 + "A": 0.15010252594947815, + "B": 0.5936670303344727, + "C": 0.01792719028890133, + "D": 0.13246501982212067 }, "sample": { "messages": [ @@ -85552,10 +85552,10 @@ ] }, "predict": { - "A": 0.08861136436462402, - "B": 0.01199224404990673, - "C": 0.008242141455411911, - "D": 0.8407212495803833 + "A": 0.0211939737200737, + "B": 0.0016343037132173777, + "C": 0.0057042804546654224, + "D": 0.9011906385421753 }, "sample": { "messages": [ @@ -85597,10 +85597,10 @@ ] }, "predict": { - "A": 0.95888751745224, - "B": 0.0016335799591615796, - "C": 0.00011833612370537594, - "D": 7.177449151640758e-05 + "A": 0.8797914385795593, + "B": 0.00037896359572187066, + "C": 6.585400115000084e-05, + "D": 2.9222532248240896e-05 }, "sample": { "messages": [ @@ -85642,10 +85642,10 @@ ] }, "predict": { - "A": 0.0018871185602620244, - "B": 0.007463698275387287, - "C": 0.0006126576336100698, - "D": 0.9775516390800476 + "A": 0.000813934369944036, + "B": 0.0005594083922915161, + "C": 0.0002812882012221962, + "D": 0.9501543641090393 }, "sample": { "messages": [ @@ -85687,10 +85687,10 @@ ] }, "predict": { - "A": 0.9214575886726379, - "B": 0.00332329492084682, - "C": 0.04587667062878609, - "D": 0.00332329492084682 + "A": 0.9207668900489807, + "B": 0.00019942976359743625, + "C": 0.011590744368731976, + "D": 0.00039661346818320453 }, "sample": { "messages": [ @@ -85732,10 +85732,10 @@ ] }, "predict": { - "A": 0.00033059349516406655, - "B": 0.00022721337154507637, - "C": 0.0006176299648359418, - "D": 0.9854853749275208 + "A": 0.00015243020607158542, + "B": 4.367198926047422e-05, + "C": 0.0001345191994914785, + "D": 0.9619395136833191 }, "sample": { "messages": [ @@ -85777,10 +85777,10 @@ ] }, "predict": { - "A": 0.0003079329035244882, - "B": 0.00032779283355921507, - "C": 0.00028927618404850364, - "D": 0.9771366715431213 + "A": 0.00016573713219258934, + "B": 7.828867819625884e-05, + "C": 0.00014626250776927918, + "D": 0.9230170845985413 }, "sample": { "messages": [ @@ -85822,10 +85822,10 @@ ] }, "predict": { - "A": 0.9616259932518005, - "B": 0.001638245303183794, - "C": 5.6057666370179504e-05, - "D": 0.009427450597286224 + "A": 0.8998607397079468, + "B": 0.00015178958710748702, + "C": 2.0542487618513405e-05, + "D": 0.004722035955637693 }, "sample": { "messages": [ @@ -85867,10 +85867,10 @@ ] }, "predict": { - "A": 0.017840659245848656, - "B": 0.0014644503826275468, - "C": 0.9740669131278992, - "D": 0.00015435193199664354 + "A": 0.01779630407691002, + "B": 0.00018572108820080757, + "C": 0.9716452360153198, + "D": 8.241322939284146e-05 }, "sample": { "messages": [ @@ -85912,10 +85912,10 @@ ] }, "predict": { - "A": 0.0018968213116750121, - "B": 0.0011504802387207747, - "C": 0.0001374052808387205, - "D": 0.9825777411460876 + "A": 0.0009693517349660397, + "B": 0.00017931203183252364, + "C": 4.533718674792908e-05, + "D": 0.9381147623062134 }, "sample": { "messages": [ @@ -85957,10 +85957,10 @@ ] }, "predict": { - "A": 0.05642613396048546, - "B": 0.8826532959938049, - "C": 0.020758016034960747, - "D": 0.009805393405258656 + "A": 0.06618451327085495, + "B": 0.8062924146652222, + "C": 0.03542601689696312, + "D": 0.013032502494752407 }, "sample": { "messages": [ @@ -86002,10 +86002,10 @@ ] }, "predict": { - "A": 0.0010255236411467195, - "B": 0.0009050214430317283, - "C": 0.9924765229225159, - "D": 0.000178209287696518 + "A": 0.0003763252170756459, + "B": 0.00018922818708233535, + "C": 0.9899935126304626, + "D": 3.7261241232044995e-05 }, "sample": { "messages": [ @@ -86047,10 +86047,10 @@ ] }, "predict": { - "A": 0.002379155717790127, - "B": 0.9598199129104614, - "C": 7.184428250184283e-05, - "D": 4.093563256901689e-05 + "A": 0.0022934405133128166, + "B": 0.9252399206161499, + "C": 7.847721280995756e-05, + "D": 4.4714972318615764e-05 }, "sample": { "messages": [ @@ -86092,10 +86092,10 @@ ] }, "predict": { - "A": 0.00017496054351795465, - "B": 0.9743837714195251, - "C": 2.6831070499611087e-05, - "D": 9.364962897961959e-05 + "A": 0.00013151847815606743, + "B": 0.8835007548332214, + "C": 1.89470338227693e-05, + "D": 0.00013151847815606743 }, "sample": { "messages": [ @@ -86137,10 +86137,10 @@ ] }, "predict": { - "A": 0.0006181906792335212, - "B": 0.00022741964494343847, - "C": 2.396985473751556e-05, - "D": 0.9863800406455994 + "A": 0.0004645955341402441, + "B": 5.212616815697402e-05, + "C": 1.0926239156106021e-05, + "D": 0.9518541693687439 }, "sample": { "messages": [ @@ -86182,10 +86182,10 @@ ] }, "predict": { - "A": 0.002781444927677512, - "B": 0.0005830224836245179, - "C": 0.9902628660202026, - "D": 0.0005476989317685366 + "A": 0.0007996620843186975, + "B": 4.802344483323395e-05, + "C": 0.9936985969543457, + "D": 7.917726179584861e-05 }, "sample": { "messages": [ @@ -86227,10 +86227,10 @@ ] }, "predict": { - "A": 0.01990838535130024, - "B": 0.9592397212982178, - "C": 0.0014421585947275162, - "D": 0.002098328433930874 + "A": 0.023735327646136284, + "B": 0.8906615972518921, + "C": 0.0016152092721313238, + "D": 0.0019483143696561456 }, "sample": { "messages": [ @@ -86272,10 +86272,10 @@ ] }, "predict": { - "A": 0.0049434625543653965, - "B": 0.9420572519302368, - "C": 0.0003811989154201001, - "D": 0.02844766341149807 + "A": 0.007913121022284031, + "B": 0.9146324396133423, + "C": 0.0004193800559733063, + "D": 0.027619507163763046 }, "sample": { "messages": [ @@ -86310,17 +86310,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "C", - "C" + "A" ] }, "predict": { - "A": 0.33330222964286804, - "B": 0.07436977326869965, - "C": 0.5495224595069885, - "D": 0.012923529371619225 + "A": 0.707568347454071, + "B": 0.01664041355252266, + "C": 0.1789012998342514, + "D": 0.008906971663236618 }, "sample": { "messages": [ @@ -86350,7 +86350,7 @@ "prompt_len": 99, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -86362,10 +86362,10 @@ ] }, "predict": { - "A": 0.017374657094478607, - "B": 0.005640724673867226, - "C": 0.0016160949598997831, - "D": 0.9486240744590759 + "A": 0.014991417527198792, + "B": 0.0015800839755684137, + "C": 0.0007945160614326596, + "D": 0.9274861812591553 }, "sample": { "messages": [ @@ -86407,10 +86407,10 @@ ] }, "predict": { - "A": 9.453547681914642e-05, - "B": 0.9836005568504333, - "C": 0.00012138595047872514, - "D": 0.0011516778031364083 + "A": 3.6658981116488576e-05, + "B": 0.9149810075759888, + "C": 5.6778546422719955e-05, + "D": 0.0002708753163460642 }, "sample": { "messages": [ @@ -86452,10 +86452,10 @@ ] }, "predict": { - "A": 0.002264206763356924, - "B": 0.005102468654513359, - "C": 0.9723584055900574, - "D": 0.002410235581919551 + "A": 0.0010869933757930994, + "B": 0.000546574592590332, + "C": 0.9882301092147827, + "D": 0.0007952614105306566 }, "sample": { "messages": [ @@ -86497,10 +86497,10 @@ ] }, "predict": { - "A": 0.806004524230957, - "B": 0.10908084362745285, - "C": 0.03541336581110954, - "D": 0.002565335715189576 + "A": 0.8202260136604309, + "B": 0.01928986795246601, + "C": 0.046273987740278244, + "D": 0.0008475375943817198 }, "sample": { "messages": [ @@ -86542,10 +86542,10 @@ ] }, "predict": { - "A": 0.1353422999382019, - "B": 0.7788411974906921, - "C": 0.012588795274496078, - "D": 0.0387762188911438 + "A": 0.22301092743873596, + "B": 0.606206476688385, + "C": 0.011103060096502304, + "D": 0.03419983386993408 }, "sample": { "messages": [ @@ -86587,10 +86587,10 @@ ] }, "predict": { - "A": 0.002392655471339822, - "B": 0.003072230378165841, - "C": 0.0006855071987956762, - "D": 0.9652660489082336 + "A": 0.000603904016315937, + "B": 0.001201006700284779, + "C": 0.0004150567692704499, + "D": 0.9635843634605408 }, "sample": { "messages": [ @@ -86632,10 +86632,10 @@ ] }, "predict": { - "A": 0.00029110885225236416, - "B": 0.000698333780746907, - "C": 0.00012135221186326817, - "D": 0.9833272099494934 + "A": 0.0001047717742039822, + "B": 9.842396684689447e-05, + "C": 5.969715130049735e-05, + "D": 0.9620140790939331 }, "sample": { "messages": [ @@ -86677,10 +86677,10 @@ ] }, "predict": { - "A": 0.12593647837638855, - "B": 0.021884478628635406, - "C": 0.0020355735905468464, - "D": 0.8212090134620667 + "A": 0.20694351196289062, + "B": 0.004571977537125349, + "C": 0.0010201460681855679, + "D": 0.7223038077354431 }, "sample": { "messages": [ @@ -86722,10 +86722,10 @@ ] }, "predict": { - "A": 0.9088725447654724, - "B": 0.03524081036448479, - "C": 0.00891027320176363, - "D": 0.003714354243129492 + "A": 0.8458120822906494, + "B": 0.003050474915653467, + "C": 0.002692034700885415, + "D": 0.0004678054538089782 }, "sample": { "messages": [ @@ -86767,10 +86767,10 @@ ] }, "predict": { - "A": 0.0010118901263922453, - "B": 0.9792823195457458, - "C": 0.00025584586546756327, - "D": 0.008472452871501446 + "A": 0.00026577094104140997, + "B": 0.9556383490562439, + "C": 4.916268517263234e-05, + "D": 0.0014367441181093454 }, "sample": { "messages": [ @@ -86812,10 +86812,10 @@ ] }, "predict": { - "A": 0.30045032501220703, - "B": 0.0116497240960598, - "C": 0.6360533237457275, - "D": 0.00048084152513183653 + "A": 0.08173112571239471, + "B": 0.0014969579642638564, + "C": 0.8786924481391907, + "D": 0.00014821889635641128 }, "sample": { "messages": [ @@ -86857,10 +86857,10 @@ ] }, "predict": { - "A": 0.963955283164978, - "B": 0.0014492481714114547, - "C": 0.00010498319170437753, - "D": 0.0008790134452283382 + "A": 0.8348919749259949, + "B": 0.00016987411072477698, + "C": 5.515004522749223e-05, + "D": 0.00021812265913467854 }, "sample": { "messages": [ @@ -86895,17 +86895,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "B", - "B" + "C" ] }, "predict": { - "A": 0.061378151178359985, - "B": 0.7477389574050903, - "C": 0.14723852276802063, - "D": 0.008306629955768585 + "A": 0.20359712839126587, + "B": 0.29623207449913025, + "C": 0.38036948442459106, + "D": 0.004498045891523361 }, "sample": { "messages": [ @@ -86935,7 +86935,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -86947,10 +86947,10 @@ ] }, "predict": { - "A": 0.015889452770352364, - "B": 0.8675347566604614, - "C": 0.0021504038013517857, - "D": 0.08069329708814621 + "A": 0.012287230230867863, + "B": 0.8614013195037842, + "C": 0.0011428904253989458, + "D": 0.04859694093465805 }, "sample": { "messages": [ @@ -86992,10 +86992,10 @@ ] }, "predict": { - "A": 0.022053547203540802, - "B": 0.30443984270095825, - "C": 0.6444991230964661, - "D": 0.0033820210956037045 + "A": 0.026795556768774986, + "B": 0.2880793809890747, + "C": 0.6098640561103821, + "D": 0.001712982659228146 }, "sample": { "messages": [ @@ -87037,10 +87037,10 @@ ] }, "predict": { - "A": 0.017318345606327057, - "B": 0.01528338622301817, - "C": 0.9455496072769165, - "D": 0.008180607110261917 + "A": 0.006612409371882677, + "B": 0.0024325696285814047, + "C": 0.9813685417175293, + "D": 0.002285187365487218 }, "sample": { "messages": [ @@ -87082,10 +87082,10 @@ ] }, "predict": { - "A": 0.044283363968133926, - "B": 0.8894551396369934, - "C": 0.0032078761141747236, - "D": 0.044283363968133926 + "A": 0.023901881650090218, + "B": 0.8969115018844604, + "C": 0.0012667534174397588, + "D": 0.014497224241495132 }, "sample": { "messages": [ @@ -87127,10 +87127,10 @@ ] }, "predict": { - "A": 0.00042712214053608477, - "B": 0.00042712214053608477, - "C": 0.9915949702262878, - "D": 0.00029355648439377546 + "A": 8.299136970890686e-05, + "B": 9.404154116054997e-05, + "C": 0.9784613847732544, + "D": 7.796317368047312e-05 }, "sample": { "messages": [ @@ -87172,10 +87172,10 @@ ] }, "predict": { - "A": 0.9511998891830444, - "B": 0.008229491300880909, - "C": 0.0004097222408745438, - "D": 0.0012620333582162857 + "A": 0.9030200242996216, + "B": 0.0006024471949785948, + "C": 7.65925578889437e-05, + "D": 0.000221627953578718 }, "sample": { "messages": [ @@ -87217,10 +87217,10 @@ ] }, "predict": { - "A": 0.8435937762260437, - "B": 0.012033218517899513, - "C": 0.002369481371715665, - "D": 0.08891412615776062 + "A": 0.9213665723800659, + "B": 0.0013852185802534223, + "C": 0.001013447530567646, + "D": 0.007971382699906826 }, "sample": { "messages": [ @@ -87262,10 +87262,10 @@ ] }, "predict": { - "A": 0.007350820582360029, - "B": 0.0011272848350927234, - "C": 0.00016240078548435122, - "D": 0.9627675414085388 + "A": 0.0020634233951568604, + "B": 0.0002972643414977938, + "C": 5.8534813433652744e-05, + "D": 0.9432830810546875 }, "sample": { "messages": [ @@ -87307,10 +87307,10 @@ ] }, "predict": { - "A": 0.0030852274503558874, - "B": 0.0018712850287556648, - "C": 0.0030852274503558874, - "D": 0.9693496227264404 + "A": 0.0017364659579470754, + "B": 0.0006388101610355079, + "C": 0.0019676736555993557, + "D": 0.9575251936912537 }, "sample": { "messages": [ @@ -87352,10 +87352,10 @@ ] }, "predict": { - "A": 0.7204935550689697, - "B": 0.20642487704753876, - "C": 0.016944386065006256, - "D": 0.01027728896588087 + "A": 0.8957526683807373, + "B": 0.011275862343609333, + "C": 0.004700478632003069, + "D": 0.0018407338066026568 }, "sample": { "messages": [ @@ -87397,10 +87397,10 @@ ] }, "predict": { - "A": 0.030337149277329445, - "B": 0.690470278263092, - "C": 0.005973738618195057, - "D": 0.25400984287261963 + "A": 0.0380062535405159, + "B": 0.6736769080162048, + "C": 0.006604496389627457, + "D": 0.24783191084861755 }, "sample": { "messages": [ @@ -87442,10 +87442,10 @@ ] }, "predict": { - "A": 0.008470204658806324, - "B": 0.0014719008468091488, - "C": 0.9790225625038147, - "D": 0.0007878518081270158 + "A": 0.005134296137839556, + "B": 0.00032822455978021026, + "C": 0.9784236550331116, + "D": 0.00024013423535507172 }, "sample": { "messages": [ @@ -87487,10 +87487,10 @@ ] }, "predict": { - "A": 0.00012108425289625302, - "B": 0.9811558723449707, - "C": 7.344131881836802e-05, - "D": 0.00017617654521018267 + "A": 0.00016329914797097445, + "B": 0.9094395637512207, + "C": 7.713705417700112e-05, + "D": 0.00025292267673648894 }, "sample": { "messages": [ @@ -87528,14 +87528,14 @@ "acc": false, "f1_macro": [ "D", - "A" + "C" ] }, "predict": { - "A": 0.656072735786438, - "B": 0.10061202198266983, - "C": 0.06914956122636795, - "D": 0.12918837368488312 + "A": 0.2306137979030609, + "B": 0.008941865526139736, + "C": 0.6268731951713562, + "D": 0.08483806252479553 }, "sample": { "messages": [ @@ -87565,7 +87565,7 @@ "prompt_len": 73, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -87577,10 +87577,10 @@ ] }, "predict": { - "A": 0.002721544820815325, - "B": 0.9689369797706604, - "C": 0.009499124251306057, - "D": 0.003494532546028495 + "A": 0.00026161715504713356, + "B": 0.940702497959137, + "C": 0.0020577784162014723, + "D": 0.0023317683953791857 }, "sample": { "messages": [ @@ -87622,10 +87622,10 @@ ] }, "predict": { - "A": 0.05015353858470917, - "B": 0.8889927268028259, - "C": 0.0028294690418988466, - "D": 0.03041965700685978 + "A": 0.030166709795594215, + "B": 0.881600558757782, + "C": 0.00360289984382689, + "D": 0.006731102708727121 }, "sample": { "messages": [ @@ -87667,10 +87667,10 @@ ] }, "predict": { - "A": 0.00962025299668312, - "B": 0.003123238915577531, - "C": 0.9812924265861511, - "D": 0.0010139672085642815 + "A": 0.009627263993024826, + "B": 0.0018957206048071384, + "C": 0.9820075631141663, + "D": 0.0005102262366563082 }, "sample": { "messages": [ @@ -87712,10 +87712,10 @@ ] }, "predict": { - "A": 0.001467101275920868, - "B": 0.9758301973342896, - "C": 0.0008898419328033924, - "D": 0.0002249872632091865 + "A": 0.0013942994410172105, + "B": 0.9274066090583801, + "C": 0.00048185698688030243, + "D": 0.00015643605729565024 }, "sample": { "messages": [ @@ -87757,10 +87757,10 @@ ] }, "predict": { - "A": 0.44259005784988403, - "B": 0.0220352616161108, - "C": 0.5015202164649963, - "D": 0.006313208024948835 + "A": 0.4362112283706665, + "B": 0.006222219206392765, + "C": 0.49429208040237427, + "D": 0.006222219206392765 }, "sample": { "messages": [ @@ -87802,10 +87802,10 @@ ] }, "predict": { - "A": 0.0006127784145064652, - "B": 0.9777442812919617, - "C": 0.00028945604572072625, - "D": 0.0002254285936942324 + "A": 0.0010668720351532102, + "B": 0.9111714363098145, + "C": 0.00039248031680472195, + "D": 0.0002697475138120353 }, "sample": { "messages": [ @@ -87847,10 +87847,10 @@ ] }, "predict": { - "A": 0.001109755365177989, - "B": 0.9477963447570801, - "C": 0.0001598754315637052, - "D": 0.00024761990061961114 + "A": 0.0010326748015359044, + "B": 0.8285293579101562, + "C": 0.00013128985301591456, + "D": 0.0001794519484974444 }, "sample": { "messages": [ @@ -87892,10 +87892,10 @@ ] }, "predict": { - "A": 0.9644307494163513, - "B": 0.0018617893802002072, - "C": 0.0006849140045233071, - "D": 0.009454946964979172 + "A": 0.9112565517425537, + "B": 0.00017417834897059947, + "C": 0.00014439891674555838, + "D": 0.0032865044195204973 }, "sample": { "messages": [ @@ -87937,10 +87937,10 @@ ] }, "predict": { - "A": 0.0012699234066531062, - "B": 0.9571465849876404, - "C": 5.5796539527364075e-05, - "D": 5.5796539527364075e-05 + "A": 0.0005542159778997302, + "B": 0.8307253122329712, + "C": 3.771487172343768e-05, + "D": 7.500498031731695e-05 }, "sample": { "messages": [ @@ -87982,10 +87982,10 @@ ] }, "predict": { - "A": 0.009479885920882225, - "B": 0.9669745564460754, - "C": 0.0005024155252613127, - "D": 0.002396890427917242 + "A": 0.019123850390315056, + "B": 0.9214387536048889, + "C": 0.0008944344008341432, + "D": 0.007035271264612675 }, "sample": { "messages": [ @@ -88027,10 +88027,10 @@ ] }, "predict": { - "A": 0.48774290084838867, - "B": 0.48774290084838867, - "C": 0.0015523787587881088, - "D": 0.00225869775749743 + "A": 0.8600871562957764, + "B": 0.08000056445598602, + "C": 0.000394369795685634, + "D": 0.0009460438741371036 }, "sample": { "messages": [ @@ -88072,10 +88072,10 @@ ] }, "predict": { - "A": 0.9563960433006287, - "B": 0.007302173413336277, - "C": 0.0030439989641308784, - "D": 0.0023706688079982996 + "A": 0.9288272857666016, + "B": 0.0013964353129267693, + "C": 0.0013118296628817916, + "D": 0.0006196644390001893 }, "sample": { "messages": [ @@ -88117,10 +88117,10 @@ ] }, "predict": { - "A": 0.08692284673452377, - "B": 0.06769558042287827, - "C": 0.0015920475125312805, - "D": 0.8247010111808777 + "A": 0.08888262510299683, + "B": 0.006438635755330324, + "C": 0.00041160828550346196, + "D": 0.8432949185371399 }, "sample": { "messages": [ @@ -88162,10 +88162,10 @@ ] }, "predict": { - "A": 0.00047274742973968387, - "B": 0.9685556292533875, - "C": 3.8805465010227636e-05, - "D": 1.5196441381704062e-05 + "A": 0.00027840383700095117, + "B": 0.8834346532821655, + "C": 3.5395070881349966e-05, + "D": 1.3860910257790238e-05 }, "sample": { "messages": [ @@ -88207,10 +88207,10 @@ ] }, "predict": { - "A": 0.019733889028429985, - "B": 0.01741509698331356, - "C": 0.9508320093154907, - "D": 0.0009229662246070802 + "A": 0.007500788662582636, + "B": 0.0021490121725946665, + "C": 0.9824094772338867, + "D": 0.00018777856894303113 }, "sample": { "messages": [ @@ -88252,10 +88252,10 @@ ] }, "predict": { - "A": 2.3766866434016265e-05, - "B": 0.00015497942513320595, - "C": 0.000787050521466881, - "D": 0.9780268669128418 + "A": 5.483040240505943e-06, + "B": 2.1685844330932014e-05, + "C": 0.0007181365508586168, + "D": 0.8923910856246948 }, "sample": { "messages": [ @@ -88297,10 +88297,10 @@ ] }, "predict": { - "A": 0.11740347743034363, - "B": 0.08069014549255371, - "C": 0.004552226513624191, - "D": 0.7655668258666992 + "A": 0.18700110912322998, + "B": 0.015349986031651497, + "C": 0.0019515317399054766, + "D": 0.7396037578582764 }, "sample": { "messages": [ @@ -88342,10 +88342,10 @@ ] }, "predict": { - "A": 0.009250862523913383, - "B": 0.009250862523913383, - "C": 0.017282886430621147, - "D": 0.943613588809967 + "A": 0.007976771332323551, + "B": 0.0027566985227167606, + "C": 0.019135279580950737, + "D": 0.921989381313324 }, "sample": { "messages": [ @@ -88387,10 +88387,10 @@ ] }, "predict": { - "A": 0.4471709132194519, - "B": 0.5067110061645508, - "C": 0.007227844092994928, - "D": 0.009280736558139324 + "A": 0.15339434146881104, + "B": 0.7790008187294006, + "C": 0.0021880532149225473, + "D": 0.00524886604398489 }, "sample": { "messages": [ @@ -88432,10 +88432,10 @@ ] }, "predict": { - "A": 0.0024143776390701532, - "B": 0.005791790317744017, - "C": 0.0016593759646639228, - "D": 0.9740293622016907 + "A": 0.0020576249808073044, + "B": 0.0014141835272312164, + "C": 0.0008577456464990973, + "D": 0.9406323432922363 }, "sample": { "messages": [ @@ -88477,10 +88477,10 @@ ] }, "predict": { - "A": 0.033282212913036346, - "B": 0.8583596348762512, - "C": 0.012243842706084251, - "D": 0.07983987778425217 + "A": 0.19665391743183136, + "B": 0.6863895654678345, + "C": 0.004923134110867977, + "D": 0.08197754621505737 }, "sample": { "messages": [ @@ -88522,10 +88522,10 @@ ] }, "predict": { - "A": 0.0012967705260962248, - "B": 0.0005754384328611195, - "C": 0.0003715310012921691, - "D": 0.9773814678192139 + "A": 0.00044281187001615763, + "B": 4.968211578670889e-05, + "C": 0.00012686772970482707, + "D": 0.9657352566719055 }, "sample": { "messages": [ @@ -88567,10 +88567,10 @@ ] }, "predict": { - "A": 0.9634119272232056, - "B": 0.007355740759521723, - "C": 0.0012782361591234803, - "D": 0.0006841904832981527 + "A": 0.921974778175354, + "B": 0.0013861330226063728, + "C": 0.00039713375736027956, + "D": 0.0001004111472866498 }, "sample": { "messages": [ @@ -88612,10 +88612,10 @@ ] }, "predict": { - "A": 0.08319971710443497, - "B": 0.789376974105835, - "C": 0.06479600816965103, - "D": 0.030607465654611588 + "A": 0.0394238717854023, + "B": 0.8972831964492798, + "C": 0.009967916645109653, + "D": 0.012799056246876717 }, "sample": { "messages": [ @@ -88657,10 +88657,10 @@ ] }, "predict": { - "A": 0.06964156776666641, - "B": 0.002874450758099556, - "C": 0.6607407927513123, - "D": 0.2430729866027832 + "A": 0.005814713425934315, + "B": 0.00037172221345826983, + "C": 0.9778844714164734, + "D": 0.007466239854693413 }, "sample": { "messages": [ @@ -88702,10 +88702,10 @@ ] }, "predict": { - "A": 0.007305169478058815, - "B": 0.0023716415744274855, - "C": 0.015465044416487217, - "D": 0.9567884206771851 + "A": 0.010396338067948818, + "B": 0.0005176031845621765, + "C": 0.004910882096737623, + "D": 0.9358484745025635 }, "sample": { "messages": [ @@ -88747,10 +88747,10 @@ ] }, "predict": { - "A": 0.022313283756375313, - "B": 0.001426440430805087, - "C": 0.011943439953029156, - "D": 0.9487849473953247 + "A": 0.011652584187686443, + "B": 0.00035187756293453276, + "C": 0.008008696138858795, + "D": 0.9256794452667236 }, "sample": { "messages": [ @@ -88792,10 +88792,10 @@ ] }, "predict": { - "A": 0.9500854015350342, - "B": 0.0014283956261351705, - "C": 0.0002482179261278361, - "D": 0.0003187181428074837 + "A": 0.8664365410804749, + "B": 0.00024096318520605564, + "C": 7.822929183021188e-05, + "D": 0.00012116390280425549 }, "sample": { "messages": [ @@ -88837,10 +88837,10 @@ ] }, "predict": { - "A": 0.08872430771589279, - "B": 0.05381401628255844, - "C": 0.0015265861293300986, - "D": 0.8417928218841553 + "A": 0.10361220687627792, + "B": 0.014022386632859707, + "C": 0.0005787730333395302, + "D": 0.8675343990325928 }, "sample": { "messages": [ @@ -88882,10 +88882,10 @@ ] }, "predict": { - "A": 0.0021405071020126343, - "B": 0.978521466255188, - "C": 0.00012075914128217846, - "D": 0.0011457307264208794 + "A": 0.0009631068096496165, + "B": 0.9320710897445679, + "C": 6.976722943363711e-05, + "D": 0.0005155139369890094 }, "sample": { "messages": [ @@ -88927,10 +88927,10 @@ ] }, "predict": { - "A": 0.04646877199411392, - "B": 0.9333502650260925, - "C": 0.0015900741564109921, - "D": 0.0015900741564109921 + "A": 0.11005967855453491, + "B": 0.813237190246582, + "C": 0.0006544388015754521, + "D": 0.0007894037989899516 }, "sample": { "messages": [ @@ -88972,10 +88972,10 @@ ] }, "predict": { - "A": 0.06846282631158829, - "B": 0.06846282631158829, - "C": 0.8340480327606201, - "D": 0.010499115101993084 + "A": 0.06328894942998886, + "B": 0.029895583167672157, + "C": 0.8736770153045654, + "D": 0.009705675765872002 }, "sample": { "messages": [ @@ -89017,10 +89017,10 @@ ] }, "predict": { - "A": 0.002449250314384699, - "B": 0.002449250314384699, - "C": 0.9880980849266052, - "D": 0.0010209993924945593 + "A": 0.0021706633269786835, + "B": 0.0004549957229755819, + "C": 0.9923072457313538, + "D": 0.0003127137024421245 }, "sample": { "messages": [ @@ -89062,10 +89062,10 @@ ] }, "predict": { - "A": 0.009400752373039722, - "B": 0.9589027762413025, - "C": 0.0020975912921130657, - "D": 0.004440600983798504 + "A": 0.010123622603714466, + "B": 0.911299467086792, + "C": 0.006957856938242912, + "D": 0.003724272595718503 }, "sample": { "messages": [ @@ -89107,10 +89107,10 @@ ] }, "predict": { - "A": 0.010860217735171318, - "B": 0.0035257963463664055, - "C": 0.9776056408882141, - "D": 0.0011446584248915315 + "A": 0.00848687905818224, + "B": 0.002755286404863, + "C": 0.9809498190879822, + "D": 0.0005096766981296241 }, "sample": { "messages": [ @@ -89152,10 +89152,10 @@ ] }, "predict": { - "A": 0.9155176281929016, - "B": 0.004239688161760569, - "C": 0.000506358512211591, - "D": 0.0009460021974518895 + "A": 0.879473865032196, + "B": 0.0005177948623895645, + "C": 0.00013936258619651198, + "D": 0.00013936258619651198 }, "sample": { "messages": [ @@ -89197,10 +89197,10 @@ ] }, "predict": { - "A": 0.004024619702249765, - "B": 0.004024619702249765, - "C": 0.9847919940948486, - "D": 0.00010075438331114128 + "A": 0.002160316100344062, + "B": 0.0005814411561004817, + "C": 0.9875770807266235, + "D": 4.772760075866245e-05 }, "sample": { "messages": [ @@ -89242,10 +89242,10 @@ ] }, "predict": { - "A": 0.8438082933425903, - "B": 0.0327179990708828, - "C": 0.0692640021443367, - "D": 0.013638890348374844 + "A": 0.8841579556465149, + "B": 0.005957409739494324, + "C": 0.0388471893966198, + "D": 0.0033944298047572374 }, "sample": { "messages": [ @@ -89287,10 +89287,10 @@ ] }, "predict": { - "A": 0.0024372749030590057, - "B": 0.006625199690461159, - "C": 0.9832668304443359, - "D": 0.00022670185717288405 + "A": 0.0013109436258673668, + "B": 0.001907411846332252, + "C": 0.9880637526512146, + "D": 8.380581857636571e-05 }, "sample": { "messages": [ @@ -89332,10 +89332,10 @@ ] }, "predict": { - "A": 0.0018397636013105512, - "B": 0.003033257322385907, - "C": 0.0020847253035753965, - "D": 0.9530211091041565 + "A": 0.00034132800647057593, + "B": 0.0002070258924504742, + "C": 0.006050183903425932, + "D": 0.8979268670082092 }, "sample": { "messages": [ @@ -89377,10 +89377,10 @@ ] }, "predict": { - "A": 0.01965206116437912, - "B": 0.02226870320737362, - "C": 0.946889340877533, - "D": 0.002659617457538843 + "A": 0.003999736160039902, + "B": 0.0035297549329698086, + "C": 0.9787032008171082, + "D": 0.0004215691005811095 }, "sample": { "messages": [ @@ -89422,10 +89422,10 @@ ] }, "predict": { - "A": 0.0007865748484618962, - "B": 0.0014695152640342712, - "C": 0.004526426084339619, - "D": 0.9774357676506042 + "A": 0.0002622758038341999, + "B": 0.00016933797451201826, + "C": 0.002648891881108284, + "D": 0.9430707693099976 }, "sample": { "messages": [ @@ -89467,10 +89467,10 @@ ] }, "predict": { - "A": 0.9660345315933228, - "B": 0.01377974171191454, - "C": 0.0014523741556331515, - "D": 0.0021131918765604496 + "A": 0.9216996431350708, + "B": 0.0013857193989679217, + "C": 0.0002563323068898171, + "D": 0.0006545674987137318 }, "sample": { "messages": [ @@ -89512,10 +89512,10 @@ ] }, "predict": { - "A": 0.9686060547828674, - "B": 0.001456240308471024, - "C": 3.880748772644438e-05, - "D": 8.215545676648617e-05 + "A": 0.816236674785614, + "B": 0.0006568526150658727, + "C": 4.469937994144857e-05, + "D": 7.369682134594768e-05 }, "sample": { "messages": [ @@ -89557,10 +89557,10 @@ ] }, "predict": { - "A": 0.08326389640569687, - "B": 0.008775951340794563, - "C": 0.8951712846755981, - "D": 0.00020639057038351893 + "A": 0.05159325897693634, + "B": 0.002000487642362714, + "C": 0.9145123958587646, + "D": 9.95984228211455e-05 }, "sample": { "messages": [ @@ -89602,10 +89602,10 @@ ] }, "predict": { - "A": 0.000895123288501054, - "B": 0.00018762802937999368, - "C": 0.0016723106382414699, - "D": 0.9816219210624695 + "A": 0.0007649622275494039, + "B": 5.2056333515793085e-05, + "C": 0.0007186155417002738, + "D": 0.9505789279937744 }, "sample": { "messages": [ @@ -89640,17 +89640,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "C" ] }, "predict": { - "A": 0.43897533416748047, - "B": 0.2662520110607147, - "C": 0.2662520110607147, - "D": 0.0029577924869954586 + "A": 0.30058157444000244, + "B": 0.26526230573654175, + "C": 0.3406035304069519, + "D": 0.0016790349036455154 }, "sample": { "messages": [ @@ -89680,7 +89680,7 @@ "prompt_len": 93, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -89692,10 +89692,10 @@ ] }, "predict": { - "A": 0.832125186920166, - "B": 0.14460165798664093, - "C": 0.0008598329732194543, - "D": 0.0006696385680697858 + "A": 0.9259517192840576, + "B": 0.03168432414531708, + "C": 0.00016626408614683896, + "D": 0.00012164140935055912 }, "sample": { "messages": [ @@ -89737,10 +89737,10 @@ ] }, "predict": { - "A": 0.548894464969635, - "B": 0.021282948553562164, - "C": 0.3772493004798889, - "D": 0.014627542346715927 + "A": 0.8714791536331177, + "B": 0.0019063529325649142, + "C": 0.02982037514448166, + "D": 0.0027737270575016737 }, "sample": { "messages": [ @@ -89782,10 +89782,10 @@ ] }, "predict": { - "A": 0.8810733556747437, - "B": 0.0015010142233222723, - "C": 0.08195258677005768, - "D": 0.0028042634949088097 + "A": 0.8966694474220276, + "B": 0.00015125128265935928, + "C": 0.007757710758596659, + "D": 0.0004959338693879545 }, "sample": { "messages": [ @@ -89827,10 +89827,10 @@ ] }, "predict": { - "A": 0.000692663190420717, - "B": 0.9753423929214478, - "C": 0.00032719093724153936, - "D": 0.00019845133647322655 + "A": 0.0013232494238764048, + "B": 0.8801482915878296, + "C": 0.00024477654369547963, + "D": 0.0001020380441332236 }, "sample": { "messages": [ @@ -89872,10 +89872,10 @@ ] }, "predict": { - "A": 0.0021119581069797277, - "B": 0.004471015650779009, - "C": 0.9654704928398132, - "D": 0.00032387927058152854 + "A": 0.00028652881155721843, + "B": 0.00019692817295435816, + "C": 0.9678564071655273, + "D": 4.3940613977611065e-05 }, "sample": { "messages": [ @@ -89917,10 +89917,10 @@ ] }, "predict": { - "A": 0.9217946529388428, - "B": 0.009036957286298275, - "C": 0.0025891317054629326, - "D": 0.016883257776498795 + "A": 0.8909555077552795, + "B": 0.007708275690674782, + "C": 0.0011821023654192686, + "D": 0.0046753054484725 }, "sample": { "messages": [ @@ -89962,10 +89962,10 @@ ] }, "predict": { - "A": 0.497733473777771, - "B": 0.03605569526553154, - "C": 0.03181903436779976, - "D": 0.3876352310180664 + "A": 0.4090065062046051, + "B": 0.02962833270430565, + "C": 0.04884885996580124, + "D": 0.4090065062046051 }, "sample": { "messages": [ @@ -90007,10 +90007,10 @@ ] }, "predict": { - "A": 0.045813627541065216, - "B": 0.9201913475990295, - "C": 0.0029287675861269236, - "D": 0.011583499610424042 + "A": 0.07891722768545151, + "B": 0.8484401702880859, + "C": 0.005370385013520718, + "D": 0.0073404461145401 }, "sample": { "messages": [ @@ -90052,10 +90052,10 @@ ] }, "predict": { - "A": 0.013798278756439686, - "B": 0.004479645751416683, - "C": 0.9673340916633606, - "D": 0.001454327953979373 + "A": 0.01079163234680891, + "B": 0.0007817431469447911, + "C": 0.9714316725730896, + "D": 0.00047415122389793396 }, "sample": { "messages": [ @@ -90097,10 +90097,10 @@ ] }, "predict": { - "A": 0.9036678671836853, - "B": 0.05098145455121994, - "C": 0.021252231672406197, - "D": 0.005373405292630196 + "A": 0.9252254366874695, + "B": 0.0033368838485330343, + "C": 0.010278326459228992, + "D": 0.0013910201378166676 }, "sample": { "messages": [ @@ -90142,10 +90142,10 @@ ] }, "predict": { - "A": 0.017692964524030685, - "B": 0.00032405793899670243, - "C": 7.230710616568103e-05, - "D": 0.9660031199455261 + "A": 0.004358586389571428, + "B": 0.00015876148245297372, + "C": 7.044999074423686e-05, + "D": 0.9411925077438354 }, "sample": { "messages": [ @@ -90187,10 +90187,10 @@ ] }, "predict": { - "A": 0.000788235105574131, - "B": 0.9794989228248596, - "C": 0.000372335925931111, - "D": 0.00019929705013055354 + "A": 0.0005477408412843943, + "B": 0.8739708065986633, + "C": 0.0002430585300317034, + "D": 0.00014742245548404753 }, "sample": { "messages": [ @@ -90232,10 +90232,10 @@ ] }, "predict": { - "A": 0.015561564825475216, - "B": 0.9627599120140076, - "C": 0.000683727441355586, - "D": 0.0016401769826188684 + "A": 0.021322092041373253, + "B": 0.9066383838653564, + "C": 0.0006048611830919981, + "D": 0.004198568873107433 }, "sample": { "messages": [ @@ -90277,10 +90277,10 @@ ] }, "predict": { - "A": 0.015282954089343548, - "B": 0.9455228447914124, - "C": 0.011902377009391785, - "D": 0.005622284486889839 + "A": 0.023838963359594345, + "B": 0.8945505023002625, + "C": 0.016384264454245567, + "D": 0.005319191608577967 }, "sample": { "messages": [ @@ -90322,10 +90322,10 @@ ] }, "predict": { - "A": 0.0024273470044136047, - "B": 0.009600344114005566, - "C": 0.9792616367340088, - "D": 0.00032850567367859185 + "A": 0.0011574695818126202, + "B": 0.001792724011465907, + "C": 0.9885470867156982, + "D": 9.501089516561478e-05 }, "sample": { "messages": [ @@ -90367,10 +90367,10 @@ ] }, "predict": { - "A": 0.0031357912812381983, - "B": 0.0005800630315206945, - "C": 0.00021339324302971363, - "D": 0.9852362871170044 + "A": 0.001995829399675131, + "B": 7.738670683465898e-05, + "C": 9.334618516732007e-05, + "D": 0.9712265729904175 }, "sample": { "messages": [ @@ -90412,10 +90412,10 @@ ] }, "predict": { - "A": 0.001116361701861024, - "B": 0.032624874264001846, - "C": 0.00043717355583794415, - "D": 0.9534385800361633 + "A": 0.00020724798378068954, + "B": 0.005021159537136555, + "C": 0.00020724798378068954, + "D": 0.9568636417388916 }, "sample": { "messages": [ @@ -90457,10 +90457,10 @@ ] }, "predict": { - "A": 0.08000026643276215, - "B": 0.022920461371541023, - "C": 0.013901962898671627, - "D": 0.8600839972496033 + "A": 0.1394052803516388, + "B": 0.006940580438822508, + "C": 0.004481175914406776, + "D": 0.8022220134735107 }, "sample": { "messages": [ @@ -90502,10 +90502,10 @@ ] }, "predict": { - "A": 0.023883230984210968, - "B": 0.8962116241455078, - "C": 0.008786150254309177, - "D": 0.057292889803647995 + "A": 0.047176994383335114, + "B": 0.8362321853637695, + "C": 0.013516434468328953, + "D": 0.041633546352386475 }, "sample": { "messages": [ @@ -90547,10 +90547,10 @@ ] }, "predict": { - "A": 0.009137979708611965, - "B": 0.8225746154785156, - "C": 0.052585434168577194, - "D": 0.04095358029007912 + "A": 0.008955041877925396, + "B": 0.711387038230896, + "C": 0.027583468705415726, + "D": 0.15873190760612488 }, "sample": { "messages": [ @@ -90592,10 +90592,10 @@ ] }, "predict": { - "A": 0.9667589664459229, - "B": 0.0027154271956533194, - "C": 0.00028620389639399946, - "D": 0.00036749307764694095 + "A": 0.8922221064567566, + "B": 0.000633633288089186, + "C": 0.0001705400791252032, + "D": 0.0001505011023255065 }, "sample": { "messages": [ @@ -90630,17 +90630,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "C" ] }, "predict": { - "A": 0.15352444350719452, - "B": 0.026678547263145447, - "C": 0.25311902165412903, - "D": 0.5358529686927795 + "A": 0.19794118404388428, + "B": 0.002823479240760207, + "C": 0.6908825635910034, + "D": 0.03897693008184433 }, "sample": { "messages": [ @@ -90670,7 +90670,7 @@ "prompt_len": 97, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " C" } } { @@ -90682,10 +90682,10 @@ ] }, "predict": { - "A": 0.8949013948440552, - "B": 0.06482644379138947, - "C": 0.0006355360383167863, - "D": 0.0008160444558598101 + "A": 0.8875498175621033, + "B": 0.005277563817799091, + "C": 0.000192235253052786, + "D": 0.00014064231072552502 }, "sample": { "messages": [ @@ -90727,10 +90727,10 @@ ] }, "predict": { - "A": 0.751659631729126, - "B": 0.1306188553571701, - "C": 0.05445004254579544, - "D": 0.003944347612559795 + "A": 0.5061419010162354, + "B": 0.18619923293590546, + "C": 0.18619923293590546, + "D": 0.004378985613584518 }, "sample": { "messages": [ @@ -90772,10 +90772,10 @@ ] }, "predict": { - "A": 0.7975530028343201, - "B": 0.15704748034477234, - "C": 0.011376481503248215, - "D": 0.0015396394301205873 + "A": 0.9124104976654053, + "B": 0.024314915761351585, + "C": 0.0029040027875453234, + "D": 0.0003692024911288172 }, "sample": { "messages": [ @@ -90817,10 +90817,10 @@ ] }, "predict": { - "A": 0.0006960933678783476, - "B": 0.9801724553108215, - "C": 0.00025607843417674303, - "D": 0.00017599995771888644 + "A": 0.00041058860369957983, + "B": 0.895458996295929, + "C": 0.00020645689801312983, + "D": 9.752334153745323e-05 }, "sample": { "messages": [ @@ -90862,10 +90862,10 @@ ] }, "predict": { - "A": 0.0004849487741012126, - "B": 0.0006226864643394947, - "C": 0.9935535192489624, - "D": 0.00020215671975165606 + "A": 0.00016703609435353428, + "B": 7.412182458210737e-05, + "C": 0.9902472496032715, + "D": 0.00010131250746781006 }, "sample": { "messages": [ @@ -90907,10 +90907,10 @@ ] }, "predict": { - "A": 0.8555775284767151, - "B": 0.009504607878625393, - "C": 0.0007801856845617294, - "D": 0.07958110421895981 + "A": 0.802574634552002, + "B": 0.0042115249671041965, + "C": 0.00044389147660695016, + "D": 0.045278165489435196 }, "sample": { "messages": [ @@ -90952,10 +90952,10 @@ ] }, "predict": { - "A": 0.0011443396797403693, - "B": 0.9773334264755249, - "C": 0.00010644018038874492, - "D": 0.00042097907862626016 + "A": 0.0017353157745674253, + "B": 0.8989158272743225, + "C": 0.00011808973067672923, + "D": 0.000412173627410084 }, "sample": { "messages": [ @@ -90997,10 +90997,10 @@ ] }, "predict": { - "A": 0.7854203581809998, - "B": 0.10629507899284363, - "C": 0.0002985610335599631, - "D": 0.0568956583738327 + "A": 0.7407622337341309, + "B": 0.10025126487016678, + "C": 0.00015072169480845332, + "D": 0.006408849265426397 }, "sample": { "messages": [ @@ -91042,10 +91042,10 @@ ] }, "predict": { - "A": 0.8870975971221924, - "B": 0.0021988952066749334, - "C": 0.016247760504484177, - "D": 0.014338597655296326 + "A": 0.8161250352859497, + "B": 0.00021321962412912399, + "C": 0.00045138600398786366, + "D": 0.0005795910838060081 }, "sample": { "messages": [ @@ -91087,10 +91087,10 @@ ] }, "predict": { - "A": 0.00577247841283679, - "B": 0.970781683921814, - "C": 0.0011366684921085835, - "D": 0.002726726233959198 + "A": 0.0870538130402565, + "B": 0.825943648815155, + "C": 0.02826223522424698, + "D": 0.008097266778349876 }, "sample": { "messages": [ @@ -91132,10 +91132,10 @@ ] }, "predict": { - "A": 0.0006908804061822593, - "B": 0.9728320240974426, - "C": 0.00047483472735621035, - "D": 0.000609699753113091 + "A": 0.0003972995327785611, + "B": 0.9223596453666687, + "C": 0.00029067054856568575, + "D": 0.00022637446818407625 }, "sample": { "messages": [ @@ -91177,10 +91177,10 @@ ] }, "predict": { - "A": 0.9073923230171204, - "B": 0.005395551677793264, - "C": 0.01883232593536377, - "D": 0.0008274347055703402 + "A": 0.6864894032478333, + "B": 0.0017016371712088585, + "C": 0.02349037490785122, + "D": 0.0007093478925526142 }, "sample": { "messages": [ @@ -91222,10 +91222,10 @@ ] }, "predict": { - "A": 0.0024403166025877, - "B": 0.9844939112663269, - "C": 0.0003742346598301083, - "D": 0.00148012675344944 + "A": 0.003453811863437295, + "B": 0.9576463103294373, + "C": 0.00030179074383340776, + "D": 0.000497568747960031 }, "sample": { "messages": [ @@ -91267,10 +91267,10 @@ ] }, "predict": { - "A": 0.03256302699446678, - "B": 0.0056586056016385555, - "C": 0.9516311883926392, - "D": 0.0005263323546387255 + "A": 0.0027782113756984472, + "B": 0.00033181000617332757, + "C": 0.9891117215156555, + "D": 8.389470895053819e-05 }, "sample": { "messages": [ @@ -91312,10 +91312,10 @@ ] }, "predict": { - "A": 0.013067916966974735, - "B": 0.9161317348480225, - "C": 0.045611511915922165, - "D": 0.006994751747697592 + "A": 0.010187872685492039, + "B": 0.9170830249786377, + "C": 0.01903345063328743, + "D": 0.005453174933791161 }, "sample": { "messages": [ @@ -91357,10 +91357,10 @@ ] }, "predict": { - "A": 0.9448068141937256, - "B": 0.002066756598651409, - "C": 0.0005225578788667917, - "D": 0.013476944528520107 + "A": 0.9053977727890015, + "B": 0.00023654289543628693, + "C": 0.00023654289543628693, + "D": 0.00887620821595192 }, "sample": { "messages": [ @@ -91402,10 +91402,10 @@ ] }, "predict": { - "A": 0.000950414570979774, - "B": 0.0011464188573881984, - "C": 0.0006136337760835886, - "D": 0.9791090488433838 + "A": 0.001056003849953413, + "B": 0.0003884821489918977, + "C": 0.00036494521191343665, + "D": 0.9600563049316406 }, "sample": { "messages": [ @@ -91447,10 +91447,10 @@ ] }, "predict": { - "A": 0.12456435710191727, - "B": 0.10992767661809921, - "C": 0.7168184518814087, - "D": 0.024528177455067635 + "A": 0.20216524600982666, + "B": 0.09549611061811447, + "C": 0.6227128505706787, + "D": 0.01880429871380329 }, "sample": { "messages": [ @@ -91492,10 +91492,10 @@ ] }, "predict": { - "A": 0.0031223820988088846, - "B": 0.9810232520103455, - "C": 0.00029042677488178015, - "D": 0.0011486600851640105 + "A": 0.0026470115408301353, + "B": 0.9424012899398804, + "C": 0.00045998161658644676, + "D": 0.0026470115408301353 }, "sample": { "messages": [ @@ -91537,10 +91537,10 @@ ] }, "predict": { - "A": 0.48414817452430725, - "B": 0.3327498733997345, - "C": 0.024104317650198936, - "D": 0.1224118322134018 + "A": 0.43551504611968994, + "B": 0.2993248403072357, + "C": 0.024570079520344734, + "D": 0.1602170467376709 }, "sample": { "messages": [ @@ -91582,10 +91582,10 @@ ] }, "predict": { - "A": 0.0003846004547085613, - "B": 0.9504634737968445, - "C": 8.581596193835139e-05, - "D": 0.0005595903494395316 + "A": 0.000266148301307112, + "B": 0.8990138173103333, + "C": 7.163284317357466e-05, + "D": 0.0008726666565053165 }, "sample": { "messages": [ @@ -91627,10 +91627,10 @@ ] }, "predict": { - "A": 0.0011355539318174124, - "B": 0.0006887483177706599, - "C": 0.00032534165075048804, - "D": 0.969829797744751 + "A": 0.0012233153684064746, + "B": 0.00012893650273326784, + "C": 0.00014610418293159455, + "D": 0.9220179915428162 }, "sample": { "messages": [ @@ -91672,10 +91672,10 @@ ] }, "predict": { - "A": 0.5116617679595947, - "B": 0.013635324314236641, - "C": 0.08891347795724869, - "D": 0.31033855676651 + "A": 0.8280400633811951, + "B": 0.000625978980679065, + "C": 0.0011694827117025852, + "D": 0.11206303536891937 }, "sample": { "messages": [ @@ -91717,10 +91717,10 @@ ] }, "predict": { - "A": 0.007418905850499868, - "B": 0.006547161843627691, - "C": 0.0010687947506085038, - "D": 0.971684992313385 + "A": 0.019782697781920433, + "B": 0.004698802251368761, + "C": 0.0005973856896162033, + "D": 0.9531837701797485 }, "sample": { "messages": [ @@ -91762,10 +91762,10 @@ ] }, "predict": { - "A": 0.027955541387200356, - "B": 0.5615020990371704, - "C": 0.3405682146549225, - "D": 0.0062377252615988255 + "A": 0.013002580963075161, + "B": 0.7099168300628662, + "C": 0.13979090750217438, + "D": 0.004493571352213621 }, "sample": { "messages": [ @@ -91807,10 +91807,10 @@ ] }, "predict": { - "A": 0.9397386312484741, - "B": 0.004351853393018246, - "C": 0.00019120708748232573, - "D": 9.031983063323423e-05 + "A": 0.9032971262931824, + "B": 0.0007269130437634885, + "C": 6.761345139238983e-05, + "D": 3.0003282518009655e-05 }, "sample": { "messages": [ @@ -91852,10 +91852,10 @@ ] }, "predict": { - "A": 7.406481745420024e-05, - "B": 0.0011585685424506664, - "C": 0.0010224331635981798, - "D": 0.9894856214523315 + "A": 1.499170048191445e-05, + "B": 0.00046637814375571907, + "C": 0.006438151001930237, + "D": 0.9555063247680664 }, "sample": { "messages": [ @@ -91897,10 +91897,10 @@ ] }, "predict": { - "A": 0.0011347753461450338, - "B": 0.020114373415708542, - "C": 0.9691647887229919, - "D": 0.0016510884743183851 + "A": 0.0009091806132346392, + "B": 0.036317091435194016, + "C": 0.9366301894187927, + "D": 0.0006248701247386634 }, "sample": { "messages": [ @@ -91942,10 +91942,10 @@ ] }, "predict": { - "A": 0.019114753231406212, - "B": 0.9210004210472107, - "C": 0.013137364760041237, - "D": 0.013137364760041237 + "A": 0.004728667438030243, + "B": 0.9011245369911194, + "C": 0.007796254940330982, + "D": 0.00368269020691514 }, "sample": { "messages": [ @@ -91987,10 +91987,10 @@ ] }, "predict": { - "A": 0.9427652955055237, - "B": 0.0014173903036862612, - "C": 0.0002463054843246937, - "D": 0.0003162625362165272 + "A": 0.8663268089294434, + "B": 0.00018763853586278856, + "C": 0.00010691321949707344, + "D": 0.00014613305393140763 }, "sample": { "messages": [ @@ -92032,10 +92032,10 @@ ] }, "predict": { - "A": 0.9118844866752625, - "B": 0.01147893164306879, - "C": 0.013007333502173424, - "D": 0.03535759821534157 + "A": 0.844524621963501, + "B": 0.002093367278575897, + "C": 0.042046401649713516, + "D": 0.02889804169535637 }, "sample": { "messages": [ @@ -92077,10 +92077,10 @@ ] }, "predict": { - "A": 0.11409600079059601, - "B": 0.8430618047714233, - "C": 0.0008711337577551603, - "D": 0.0002202570904046297 + "A": 0.24998676776885986, + "B": 0.5996870994567871, + "C": 0.0007956530898809433, + "D": 0.00014718103921040893 }, "sample": { "messages": [ @@ -92118,14 +92118,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.27512574195861816, - "B": 0.5824412107467651, - "C": 0.006470337510108948, - "D": 0.06956268101930618 + "A": 0.6931071877479553, + "B": 0.05020852014422417, + "C": 0.0028325708117336035, + "D": 0.17524494230747223 }, "sample": { "messages": [ @@ -92155,7 +92155,7 @@ "prompt_len": 61, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -92167,10 +92167,10 @@ ] }, "predict": { - "A": 0.05788116902112961, - "B": 0.039781104773283005, - "C": 0.08421659469604492, - "D": 0.7990248799324036 + "A": 0.0836656242609024, + "B": 0.00778211560100317, + "C": 0.25770828127861023, + "D": 0.6182100176811218 }, "sample": { "messages": [ @@ -92212,10 +92212,10 @@ ] }, "predict": { - "A": 0.004532734863460064, - "B": 0.0027492428198456764, - "C": 0.8637863993644714, - "D": 0.11690077185630798 + "A": 0.0012526224600151181, + "B": 0.00026256393175572157, + "C": 0.9441068172454834, + "D": 0.04700430855154991 }, "sample": { "messages": [ @@ -92257,10 +92257,10 @@ ] }, "predict": { - "A": 0.14016754925251007, - "B": 0.0214953925460577, - "C": 0.8066086173057556, - "D": 0.016740627586841583 + "A": 0.013655595481395721, + "B": 0.006450446788221598, + "C": 0.9573311805725098, + "D": 0.008282537572085857 }, "sample": { "messages": [ @@ -92302,10 +92302,10 @@ ] }, "predict": { - "A": 0.000906262022908777, - "B": 0.0004280878638383001, - "C": 0.9938369393348694, - "D": 0.00025964839733205736 + "A": 0.0004279213026165962, + "B": 9.548215166432783e-05, + "C": 0.9934502840042114, + "D": 0.00010164021659875289 }, "sample": { "messages": [ @@ -92340,17 +92340,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.28443628549575806, - "B": 0.00858923140913248, - "C": 0.003580524353310466, - "D": 0.682327151298523 + "A": 0.5064419507980347, + "B": 0.0013363067992031574, + "C": 0.0013363067992031574, + "D": 0.44693344831466675 }, "sample": { "messages": [ @@ -92380,7 +92380,7 @@ "prompt_len": 83, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -92392,10 +92392,10 @@ ] }, "predict": { - "A": 0.08701378852128983, - "B": 0.02200053073465824, - "C": 0.44189250469207764, - "D": 0.38996875286102295 + "A": 0.04769618436694145, + "B": 0.019882727414369583, + "C": 0.8454350829124451, + "D": 0.005027140490710735 }, "sample": { "messages": [ @@ -92437,10 +92437,10 @@ ] }, "predict": { - "A": 0.007984585128724575, - "B": 0.016903365030884743, - "C": 0.8144497871398926, - "D": 0.1415301412343979 + "A": 0.00463405204936862, + "B": 0.01832803152501583, + "C": 0.8830940127372742, + "D": 0.07248876988887787 }, "sample": { "messages": [ @@ -92482,10 +92482,10 @@ ] }, "predict": { - "A": 0.19260017573833466, - "B": 0.017914608120918274, - "C": 0.4620238244533539, - "D": 0.28023162484169006 + "A": 0.04113248363137245, + "B": 0.011784653179347515, + "C": 0.7290906310081482, + "D": 0.18434298038482666 }, "sample": { "messages": [ @@ -92527,10 +92527,10 @@ ] }, "predict": { - "A": 0.6544966697692871, - "B": 0.2124839574098587, - "C": 0.07816848158836365, - "D": 0.013583645224571228 + "A": 0.7563022375106812, + "B": 0.07034706324338913, + "C": 0.025879239663481712, + "D": 0.00654329638928175 }, "sample": { "messages": [ @@ -92572,10 +92572,10 @@ ] }, "predict": { - "A": 0.8333593606948853, - "B": 0.08783543109893799, - "C": 0.03661525622010231, - "D": 0.015263509936630726 + "A": 0.950793445110321, + "B": 0.002079852158203721, + "C": 0.0038856754545122385, + "D": 0.0010458152974024415 }, "sample": { "messages": [ @@ -92617,10 +92617,10 @@ ] }, "predict": { - "A": 0.20162905752658844, - "B": 0.7037545442581177, - "C": 0.008858962915837765, - "D": 0.04498952254652977 + "A": 0.06123223528265953, + "B": 0.8452848792076111, + "C": 0.0019683067221194506, + "D": 0.025525391101837158 }, "sample": { "messages": [ @@ -92662,10 +92662,10 @@ ] }, "predict": { - "A": 0.01762288436293602, - "B": 0.002240498084574938, - "C": 0.005049040541052818, - "D": 0.9621768593788147 + "A": 0.005731665529310703, + "B": 0.00034421327291056514, + "C": 0.0016421498730778694, + "D": 0.9639179706573486 }, "sample": { "messages": [ @@ -92707,10 +92707,10 @@ ] }, "predict": { - "A": 0.9393934607505798, - "B": 0.004929484333842993, - "C": 0.0038390865083783865, - "D": 0.002638563048094511 + "A": 0.9382728934288025, + "B": 0.0002032213960774243, + "C": 0.0005524130538105965, + "D": 0.0003566647064872086 }, "sample": { "messages": [ @@ -92752,10 +92752,10 @@ ] }, "predict": { - "A": 0.01767727918922901, - "B": 0.9651466608047485, - "C": 0.001544623402878642, - "D": 0.004469516221433878 + "A": 0.01743193343281746, + "B": 0.9517512321472168, + "C": 0.0008678847807459533, + "D": 0.001344206277281046 }, "sample": { "messages": [ @@ -92797,10 +92797,10 @@ ] }, "predict": { - "A": 0.0003718484949786216, - "B": 0.00047746291966177523, - "C": 0.0012978786835446954, - "D": 0.9782166481018066 + "A": 0.00024301533994730562, + "B": 0.00010130387818207964, + "C": 0.0014886496355757117, + "D": 0.9301718473434448 }, "sample": { "messages": [ @@ -92842,10 +92842,10 @@ ] }, "predict": { - "A": 0.0010227757738903165, - "B": 0.0013132700696587563, - "C": 0.9898172616958618, - "D": 9.513297118246555e-05 + "A": 0.0003292113833595067, + "B": 0.0002729258267208934, + "C": 0.9813653826713562, + "D": 3.469862713245675e-05 }, "sample": { "messages": [ @@ -92887,10 +92887,10 @@ ] }, "predict": { - "A": 0.9742199778556824, - "B": 0.0012925759656354785, - "C": 0.0012925759656354785, - "D": 4.4229520426597446e-05 + "A": 0.8918185234069824, + "B": 0.00018145685316994786, + "C": 0.0013407948426902294, + "D": 1.585552490723785e-05 }, "sample": { "messages": [ @@ -92925,17 +92925,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.17679518461227417, - "B": 0.7923410534858704, - "C": 0.0041578239761292934, - "D": 0.008802114054560661 + "A": 0.5144477486610413, + "B": 0.40065228939056396, + "C": 0.0021024304442107677, + "D": 0.0023823657538741827 }, "sample": { "messages": [ @@ -92965,7 +92965,7 @@ "prompt_len": 95, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -92977,10 +92977,10 @@ ] }, "predict": { - "A": 0.00819080788642168, - "B": 0.9467287063598633, - "C": 0.007228362374007702, - "D": 0.0056294542737305164 + "A": 0.0032150347251445055, + "B": 0.8914399743080139, + "C": 0.0014266630169004202, + "D": 0.002209658967331052 }, "sample": { "messages": [ @@ -93022,10 +93022,10 @@ ] }, "predict": { - "A": 0.9609648585319519, - "B": 0.004450150299817324, - "C": 0.0030585406348109245, - "D": 0.0011251741088926792 + "A": 0.912549614906311, + "B": 0.0005047169397585094, + "C": 0.0015546377981081605, + "D": 0.0001856749877333641 }, "sample": { "messages": [ @@ -93067,10 +93067,10 @@ ] }, "predict": { - "A": 0.9623891115188599, - "B": 0.005722574889659882, - "C": 0.00022188830189406872, - "D": 0.00112684175837785 + "A": 0.91374272108078, + "B": 0.000609600858297199, + "C": 6.425145693356171e-05, + "D": 0.00016407182556577027 }, "sample": { "messages": [ @@ -93112,10 +93112,10 @@ ] }, "predict": { - "A": 0.0019023822387680411, - "B": 0.9854583740234375, - "C": 0.0003305844438727945, - "D": 0.004563577938824892 + "A": 0.0008257658337242901, + "B": 0.9639659523963928, + "C": 9.264836262445897e-05, + "D": 0.0008257658337242901 }, "sample": { "messages": [ @@ -93150,17 +93150,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.2817660868167877, - "B": 0.012379937805235386, - "C": 0.6759216785430908, - "D": 0.0027623374480754137 + "A": 0.8204659819602966, + "B": 0.0017947619780898094, + "C": 0.09799068421125412, + "D": 0.0010885781375691295 }, "sample": { "messages": [ @@ -93190,7 +93190,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -93202,10 +93202,10 @@ ] }, "predict": { - "A": 0.9117960929870605, - "B": 0.005421737674623728, - "C": 0.04539565369486809, - "D": 0.000504300172906369 + "A": 0.8506771326065063, + "B": 0.0005331420106813312, + "C": 0.032984331250190735, + "D": 0.00011175265535712242 }, "sample": { "messages": [ @@ -93247,10 +93247,10 @@ ] }, "predict": { - "A": 0.022597042843699455, - "B": 0.0018548782682046294, - "C": 0.0006823715521022677, - "D": 0.960850715637207 + "A": 0.024325072765350342, + "B": 0.000781928189098835, + "C": 0.000781928189098835, + "D": 0.9127916097640991 }, "sample": { "messages": [ @@ -93292,10 +93292,10 @@ ] }, "predict": { - "A": 0.9763860702514648, - "B": 0.0039902664721012115, - "C": 0.0003275410272181034, - "D": 0.0021358358208090067 + "A": 0.9451761841773987, + "B": 0.00046133605064824224, + "C": 5.865228740731254e-05, + "D": 0.0004910895950160921 }, "sample": { "messages": [ @@ -93337,10 +93337,10 @@ ] }, "predict": { - "A": 0.0911436676979065, - "B": 0.05528143048286438, - "C": 0.7631365656852722, - "D": 0.07098275423049927 + "A": 0.04447947070002556, + "B": 0.004990458954125643, + "C": 0.8933941125869751, + "D": 0.04447947070002556 }, "sample": { "messages": [ @@ -93382,10 +93382,10 @@ ] }, "predict": { - "A": 0.036501046270132065, - "B": 0.09922013431787491, - "C": 0.0071874819695949554, - "D": 0.8307600021362305 + "A": 0.007209750823676586, + "B": 0.011886870488524437, + "C": 0.0018229106208309531, + "D": 0.9442911148071289 }, "sample": { "messages": [ @@ -93427,10 +93427,10 @@ ] }, "predict": { - "A": 1.3643989404954482e-05, - "B": 1.4523950085276738e-05, - "C": 3.240731530240737e-06, - "D": 0.9853962063789368 + "A": 8.259922651632223e-06, + "B": 5.009896540286718e-06, + "C": 1.8430380350764608e-06, + "D": 0.9239518642425537 }, "sample": { "messages": [ @@ -93465,17 +93465,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "C" + "B" ] }, "predict": { - "A": 0.1172504797577858, - "B": 0.24821926653385162, - "C": 0.5954470634460449, - "D": 0.01586812734603882 + "A": 0.37813717126846313, + "B": 0.42848554253578186, + "C": 0.08437380194664001, + "D": 0.010077014565467834 }, "sample": { "messages": [ @@ -93505,7 +93505,7 @@ "prompt_len": 90, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " B" } } { @@ -93517,10 +93517,10 @@ ] }, "predict": { - "A": 0.004014366306364536, - "B": 0.0008957262034527957, - "C": 0.9822830557823181, - "D": 0.0021487355697900057 + "A": 0.0014815073227509856, + "B": 0.00011424157855799422, + "C": 0.9854121804237366, + "D": 0.0003987422678619623 }, "sample": { "messages": [ @@ -93562,10 +93562,10 @@ ] }, "predict": { - "A": 0.00042262146598659456, - "B": 0.0003503655025269836, - "C": 0.0007417238084599376, - "D": 0.9811463356018066 + "A": 0.00019338539277669042, + "B": 7.573080074507743e-05, + "C": 0.0006749813328497112, + "D": 0.9504445195198059 }, "sample": { "messages": [ @@ -93607,10 +93607,10 @@ ] }, "predict": { - "A": 0.9038271903991699, - "B": 0.027293216437101364, - "C": 0.007819637656211853, - "D": 0.0212559774518013 + "A": 0.8934168815612793, + "B": 0.004688221495598555, + "C": 0.0053124506957829, + "D": 0.018542274832725525 }, "sample": { "messages": [ @@ -93652,10 +93652,10 @@ ] }, "predict": { - "A": 0.009273647330701351, - "B": 0.013493076898157597, - "C": 0.0011075792135670781, - "D": 0.9459377527236938 + "A": 0.003572275163605809, + "B": 0.0043089864775538445, + "C": 0.00025877472944557667, + "D": 0.9304819107055664 }, "sample": { "messages": [ @@ -93697,10 +93697,10 @@ ] }, "predict": { - "A": 0.0724007785320282, - "B": 0.020743170753121376, - "C": 0.09296444803476334, - "D": 0.7783817648887634 + "A": 0.04288789629936218, + "B": 0.006178587209433317, + "C": 0.020258808508515358, + "D": 0.8614264726638794 }, "sample": { "messages": [ @@ -93742,10 +93742,10 @@ ] }, "predict": { - "A": 0.9582428932189941, - "B": 0.00829042587429285, - "C": 0.0007711297366768122, - "D": 0.0018498440040275455 + "A": 0.9278774857521057, + "B": 0.002606217050924897, + "C": 0.00048210163367912173, + "D": 0.0009587735985405743 }, "sample": { "messages": [ @@ -93787,10 +93787,10 @@ ] }, "predict": { - "A": 0.02099364809691906, - "B": 0.892672598361969, - "C": 0.011237090453505516, - "D": 0.05036114528775215 + "A": 0.04710138589143753, + "B": 0.8348919749259949, + "C": 0.028568431735038757, + "D": 0.03668259456753731 }, "sample": { "messages": [ @@ -93832,10 +93832,10 @@ ] }, "predict": { - "A": 0.015169886872172356, - "B": 0.028341079130768776, - "C": 0.0014110192423686385, - "D": 0.9385276436805725 + "A": 0.04849683865904808, + "B": 0.010821106843650341, + "C": 0.001880426425486803, + "D": 0.8596270084381104 }, "sample": { "messages": [ @@ -93873,14 +93873,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.3114941716194153, - "B": 0.39996641874313354, - "C": 0.18893077969551086, - "D": 0.06133684143424034 + "A": 0.638334333896637, + "B": 0.16139619052410126, + "C": 0.06727994233369827, + "D": 0.019276026636362076 }, "sample": { "messages": [ @@ -93910,7 +93910,7 @@ "prompt_len": 70, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -93922,10 +93922,10 @@ ] }, "predict": { - "A": 0.00042513423250056803, - "B": 0.9869799613952637, - "C": 0.0007009278633631766, - "D": 0.0007009278633631766 + "A": 0.00040578973130322993, + "B": 0.9420702457427979, + "C": 0.000807009229902178, + "D": 0.0003160293563269079 }, "sample": { "messages": [ @@ -93967,10 +93967,10 @@ ] }, "predict": { - "A": 0.951788604259491, - "B": 0.004994528368115425, - "C": 0.0038897425401955843, - "D": 0.0006759358802810311 + "A": 0.8891735076904297, + "B": 0.0012558245798572898, + "C": 0.0022040409967303276, + "D": 0.00021822961571160704 }, "sample": { "messages": [ @@ -94005,17 +94005,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.22050818800926208, - "B": 0.03831858187913895, - "C": 0.03831858187913895, - "D": 0.6792130470275879 + "A": 0.8526961207389832, + "B": 0.009472599253058434, + "C": 0.04810582846403122, + "D": 0.025749191641807556 }, "sample": { "messages": [ @@ -94045,22 +94045,22 @@ "prompt_len": 84, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "D", - "D" + "A" ] }, "predict": { - "A": 0.35704100131988525, - "B": 0.13134804368019104, - "C": 0.07030554115772247, - "D": 0.4045804738998413 + "A": 0.5453040599822998, + "B": 0.044761281460523605, + "C": 0.06512728333473206, + "D": 0.2918802499771118 }, "sample": { "messages": [ @@ -94090,7 +94090,7 @@ "prompt_len": 82, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -94102,10 +94102,10 @@ ] }, "predict": { - "A": 0.006243356037884951, - "B": 0.05227498337626457, - "C": 0.9265962243080139, - "D": 0.004862330853939056 + "A": 0.0031321635469794273, + "B": 0.003549206303432584, + "C": 0.9840965270996094, + "D": 0.0021527025382965803 }, "sample": { "messages": [ @@ -94147,10 +94147,10 @@ ] }, "predict": { - "A": 0.003989505115896463, - "B": 0.9761996865272522, - "C": 0.00032747851219028234, - "D": 0.0006118103628978133 + "A": 0.0029736021533608437, + "B": 0.9342779517173767, + "C": 0.0002440880925860256, + "D": 0.0006233008462004364 }, "sample": { "messages": [ @@ -94192,10 +94192,10 @@ ] }, "predict": { - "A": 0.0024207059759646654, - "B": 0.9765824675559998, - "C": 0.000476665300084278, - "D": 0.0024207059759646654 + "A": 0.001682171248830855, + "B": 0.9275858998298645, + "C": 0.00021386407024692744, + "D": 0.002605397952720523 }, "sample": { "messages": [ @@ -94237,10 +94237,10 @@ ] }, "predict": { - "A": 0.9511635303497314, - "B": 0.013567619025707245, - "C": 0.0007654327200725675, - "D": 0.0004097066121175885 + "A": 0.8913914561271667, + "B": 0.0011826807167381048, + "C": 0.00040872354293242097, + "D": 0.00015036098193377256 }, "sample": { "messages": [ @@ -94282,10 +94282,10 @@ ] }, "predict": { - "A": 0.9228984713554382, - "B": 0.0029373837169259787, - "C": 0.0017816132167354226, - "D": 0.04054933041334152 + "A": 0.9268569946289062, + "B": 0.0004523945099208504, + "C": 0.0005126300966367126, + "D": 0.024699902161955833 }, "sample": { "messages": [ @@ -94327,10 +94327,10 @@ ] }, "predict": { - "A": 0.9617542624473572, - "B": 0.0006830132333561778, - "C": 2.0625211618607864e-05, - "D": 4.602107310347492e-06 + "A": 0.9239452481269836, + "B": 0.00010062574438052252, + "C": 1.1289895155641716e-05, + "D": 1.843024847403285e-06 }, "sample": { "messages": [ @@ -94372,10 +94372,10 @@ ] }, "predict": { - "A": 0.0249035581946373, - "B": 0.006296605337411165, - "C": 0.013329913839697838, - "D": 0.9344991445541382 + "A": 0.010643110610544682, + "B": 0.001968777272850275, + "C": 0.004722851328551769, + "D": 0.9580622315406799 }, "sample": { "messages": [ @@ -94417,10 +94417,10 @@ ] }, "predict": { - "A": 0.9559547305107117, - "B": 0.0020911425817757845, - "C": 0.0004117703647352755, - "D": 0.0023695749696344137 + "A": 0.9041143655776978, + "B": 7.668537728022784e-05, + "C": 3.0030425477889366e-05, + "D": 0.0001264328311663121 }, "sample": { "messages": [ @@ -94462,10 +94462,10 @@ ] }, "predict": { - "A": 0.0019120094366371632, - "B": 0.0019120094366371632, - "C": 0.9904453754425049, - "D": 0.0004266269679646939 + "A": 0.0013156709028407931, + "B": 0.00024337467039003968, + "C": 0.9916267395019531, + "D": 8.410802547587082e-05 }, "sample": { "messages": [ @@ -94507,10 +94507,10 @@ ] }, "predict": { - "A": 0.006566641386598349, - "B": 0.008431734517216682, - "C": 0.974575936794281, - "D": 0.001141111133620143 + "A": 0.0027633379213511944, + "B": 0.0021520897280424833, + "C": 0.983816385269165, + "D": 0.00025703044957481325 }, "sample": { "messages": [ @@ -94552,10 +94552,10 @@ ] }, "predict": { - "A": 0.0007012386922724545, - "B": 0.0009004083112813532, - "C": 0.9874175786972046, - "D": 7.391002145595849e-05 + "A": 0.0003090041864197701, + "B": 0.0002561734290793538, + "C": 0.9805361032485962, + "D": 4.451627319213003e-05 }, "sample": { "messages": [ @@ -94597,10 +94597,10 @@ ] }, "predict": { - "A": 0.016791457310318947, - "B": 0.1593129187822342, - "C": 0.004245545715093613, - "D": 0.8090577721595764 + "A": 0.003386838361620903, + "B": 0.032133426517248154, + "C": 0.0017030092421919107, + "D": 0.9390764236450195 }, "sample": { "messages": [ @@ -94642,10 +94642,10 @@ ] }, "predict": { - "A": 0.0020318329334259033, - "B": 0.0009016203694045544, - "C": 0.0462443046271801, - "D": 0.9288417100906372 + "A": 0.0005450840690173209, + "B": 0.00021345799905247986, + "C": 0.013206178322434425, + "D": 0.9258246421813965 }, "sample": { "messages": [ @@ -94687,10 +94687,10 @@ ] }, "predict": { - "A": 0.01975506916642189, - "B": 0.008235137909650803, - "C": 0.951852560043335, - "D": 0.0011145047610625625 + "A": 0.008412945084273815, + "B": 0.002731283660978079, + "C": 0.9724043011665344, + "D": 0.0003262053069192916 }, "sample": { "messages": [ @@ -94732,10 +94732,10 @@ ] }, "predict": { - "A": 0.03821706771850586, - "B": 0.19408227503299713, - "C": 0.06300928443670273, - "D": 0.6774137020111084 + "A": 0.06583190709352493, + "B": 0.045245565474033356, + "C": 0.045245565474033356, + "D": 0.8019967675209045 }, "sample": { "messages": [ @@ -94770,17 +94770,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "B", - "A" + "B" ] }, "predict": { - "A": 0.4957602918148041, - "B": 0.43750691413879395, - "C": 0.02468245103955269, - "D": 0.011659164912998676 + "A": 0.0722910538315773, + "B": 0.8806853294372559, + "C": 0.009783530607819557, + "D": 0.009783530607819557 }, "sample": { "messages": [ @@ -94810,7 +94810,7 @@ "prompt_len": 72, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " B" } } { @@ -94822,10 +94822,10 @@ ] }, "predict": { - "A": 0.01453256607055664, - "B": 0.1770429015159607, - "C": 0.0004972770111635327, - "D": 0.793451189994812 + "A": 0.031724195927381516, + "B": 0.010299339890480042, + "C": 0.0005127738695591688, + "D": 0.9271169304847717 }, "sample": { "messages": [ @@ -94867,10 +94867,10 @@ ] }, "predict": { - "A": 0.011914984323084354, - "B": 0.5740960836410522, - "C": 0.39457011222839355, - "D": 0.0008631185628473759 + "A": 0.0060209548100829124, + "B": 0.614154040813446, + "C": 0.32873299717903137, + "D": 0.0005600359872914851 }, "sample": { "messages": [ @@ -94912,10 +94912,10 @@ ] }, "predict": { - "A": 0.07373818755149841, - "B": 0.00997937936335802, - "C": 0.8983150720596313, - "D": 0.0025231821928173304 + "A": 0.036297351121902466, + "B": 0.002979468321427703, + "C": 0.9361210465431213, + "D": 0.0012420271523296833 }, "sample": { "messages": [ @@ -94957,10 +94957,10 @@ ] }, "predict": { - "A": 0.02487027272582054, - "B": 0.9332500696182251, - "C": 0.017093071714043617, - "D": 0.0020414763130247593 + "A": 0.3270110785961151, + "B": 0.6109371185302734, + "C": 0.007690563332289457, + "D": 0.003205903572961688 }, "sample": { "messages": [ @@ -95002,10 +95002,10 @@ ] }, "predict": { - "A": 0.009455325081944466, - "B": 0.964469313621521, - "C": 0.0012796389637514949, - "D": 0.0012796389637514949 + "A": 0.015184142626821995, + "B": 0.9394096732139587, + "C": 0.0017036135541275144, + "D": 0.0020549504552036524 }, "sample": { "messages": [ @@ -95047,10 +95047,10 @@ ] }, "predict": { - "A": 0.9597948789596558, - "B": 0.0014429931761696935, - "C": 0.00019528789562173188, - "D": 0.00014287566591519862 + "A": 0.8927576541900635, + "B": 0.0001172807315015234, + "C": 6.682454841211438e-05, + "D": 4.5927794417366385e-05 }, "sample": { "messages": [ @@ -95092,10 +95092,10 @@ ] }, "predict": { - "A": 9.709272853797302e-05, - "B": 0.03679681196808815, - "C": 3.152139106532559e-05, - "D": 0.9490022659301758 + "A": 8.39135391288437e-05, + "B": 0.017022427171468735, + "C": 1.8723640096141025e-05, + "D": 0.9293929934501648 }, "sample": { "messages": [ @@ -95137,10 +95137,10 @@ ] }, "predict": { - "A": 0.0012965656351298094, - "B": 0.977226972579956, - "C": 0.00017547106835991144, - "D": 0.0003714722697623074 + "A": 0.002067652065306902, + "B": 0.9452161192893982, + "C": 0.00033753487514331937, + "D": 0.0006305982242338359 }, "sample": { "messages": [ @@ -95182,10 +95182,10 @@ ] }, "predict": { - "A": 0.006488629151135683, - "B": 0.004459565505385399, - "C": 0.0030650116968899965, - "D": 0.9629979729652405 + "A": 0.0038366534281522036, + "B": 0.0015024550957605243, + "C": 0.0008560730493627489, + "D": 0.9387981295585632 }, "sample": { "messages": [ @@ -95227,10 +95227,10 @@ ] }, "predict": { - "A": 0.569393515586853, - "B": 0.3453546464443207, - "C": 0.025017406791448593, - "D": 0.022077783942222595 + "A": 0.8925442099571228, + "B": 0.030541183426976204, + "C": 0.003647624282166362, + "D": 0.006013915408402681 }, "sample": { "messages": [ @@ -95272,10 +95272,10 @@ ] }, "predict": { - "A": 0.25843170285224915, - "B": 0.01652098447084427, - "C": 0.002533575054258108, - "D": 0.7024900913238525 + "A": 0.23677228391170502, + "B": 0.004336635582149029, + "C": 0.0016982508823275566, + "D": 0.7293099761009216 }, "sample": { "messages": [ @@ -95317,10 +95317,10 @@ ] }, "predict": { - "A": 0.000775238499045372, - "B": 0.9633486866950989, - "C": 0.0006037562852725387, - "D": 0.0006037562852725387 + "A": 0.0024781087413430214, + "B": 0.8822677731513977, + "C": 0.0342092327773571, + "D": 0.00232796766795218 }, "sample": { "messages": [ @@ -95362,10 +95362,10 @@ ] }, "predict": { - "A": 0.003563427599146962, - "B": 0.0014854575274512172, - "C": 0.9880396723747253, - "D": 0.0005464692949317396 + "A": 0.0027757929638028145, + "B": 0.00014711162657476962, + "C": 0.988250732421875, + "D": 0.00012195982708362862 }, "sample": { "messages": [ @@ -95407,10 +95407,10 @@ ] }, "predict": { - "A": 0.01950622722506523, - "B": 0.015191464684903622, - "C": 0.9398626685142517, - "D": 0.009214090183377266 + "A": 0.005135287996381521, + "B": 0.0008383127278648317, + "C": 0.9786126613616943, + "D": 0.00012855941895395517 }, "sample": { "messages": [ @@ -95452,10 +95452,10 @@ ] }, "predict": { - "A": 0.0011554865632206202, - "B": 0.9868534803390503, - "C": 0.00029215277754701674, - "D": 0.00022752879885956645 + "A": 0.0006287945434451103, + "B": 0.942512571811676, + "C": 0.0001318025024374947, + "D": 0.00014030301827006042 }, "sample": { "messages": [ @@ -95497,10 +95497,10 @@ ] }, "predict": { - "A": 0.07094502449035645, - "B": 0.009601365774869919, - "C": 0.03351205587387085, - "D": 0.8642873764038086 + "A": 0.06467977911233902, + "B": 0.0008667095098644495, + "C": 0.004685387481004, + "D": 0.8928768038749695 }, "sample": { "messages": [ @@ -95535,17 +95535,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "A" + "D" ] }, "predict": { - "A": 0.4958096742630005, - "B": 0.031696051359176636, - "C": 0.01497215498238802, - "D": 0.43755051493644714 + "A": 0.0057778083719313145, + "B": 0.0015550762182101607, + "C": 0.0019967572297900915, + "D": 0.9716780185699463 }, "sample": { "messages": [ @@ -95575,7 +95575,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -95587,10 +95587,10 @@ ] }, "predict": { - "A": 0.00042054394725710154, - "B": 0.0005399890942499042, - "C": 0.0004765387566294521, - "D": 0.9763231873512268 + "A": 0.0002330410061404109, + "B": 8.573100058129057e-05, + "C": 0.00019319778948556632, + "D": 0.8919938206672668 }, "sample": { "messages": [ @@ -95632,10 +95632,10 @@ ] }, "predict": { - "A": 0.012166975997388363, - "B": 0.012166975997388363, - "C": 0.0012823898578062654, - "D": 0.9665426015853882 + "A": 0.0031079335603863, + "B": 0.0012955793645232916, + "C": 0.00028908284730277956, + "D": 0.9764837026596069 }, "sample": { "messages": [ @@ -95677,10 +95677,10 @@ ] }, "predict": { - "A": 0.00013601180398836732, - "B": 5.003594560548663e-05, - "C": 1.1164528586959932e-05, - "D": 0.9726130962371826 + "A": 7.20495663699694e-05, + "B": 1.939187859534286e-05, + "C": 9.750849130796269e-06, + "D": 0.9042438864707947 }, "sample": { "messages": [ @@ -95722,10 +95722,10 @@ ] }, "predict": { - "A": 0.17311158776283264, - "B": 0.016101883724331856, - "C": 0.002179153263568878, - "D": 0.7758322358131409 + "A": 0.06414588540792465, + "B": 0.0009740038076415658, + "C": 0.0005213466938585043, + "D": 0.8855066299438477 }, "sample": { "messages": [ @@ -95767,10 +95767,10 @@ ] }, "predict": { - "A": 0.0021228152327239513, - "B": 0.97043377161026, - "C": 0.00023817329201847315, - "D": 0.00047366414219141006 + "A": 0.0017041207756847143, + "B": 0.8827564120292664, + "C": 0.00033556128619238734, + "D": 0.0007103832322172821 }, "sample": { "messages": [ @@ -95812,10 +95812,10 @@ ] }, "predict": { - "A": 0.9180628061294556, - "B": 0.040336865931749344, - "C": 0.0015640301862731576, - "D": 0.005459000822156668 + "A": 0.8772841691970825, + "B": 0.012513786554336548, + "C": 0.0009064956684596837, + "D": 0.0027922033332288265 }, "sample": { "messages": [ @@ -95857,10 +95857,10 @@ ] }, "predict": { - "A": 0.025580136105418205, - "B": 0.9598875641822815, - "C": 0.00036488106707111, - "D": 0.0020997454412281513 + "A": 0.04017629846930504, + "B": 0.9144083261489868, + "C": 0.0005057449452579021, + "D": 0.0029103613924235106 }, "sample": { "messages": [ @@ -95902,10 +95902,10 @@ ] }, "predict": { - "A": 0.006548015400767326, - "B": 0.006548015400767326, - "C": 0.9718116521835327, - "D": 0.007419873028993607 + "A": 0.0007003323989920318, + "B": 0.0003748609160538763, + "C": 0.9861414432525635, + "D": 0.0006579014007002115 }, "sample": { "messages": [ @@ -95947,10 +95947,10 @@ ] }, "predict": { - "A": 0.002343769883736968, - "B": 0.004961760714650154, - "C": 0.004961760714650154, - "D": 0.9455441832542419 + "A": 0.0010886521777138114, + "B": 0.0026115411892533302, + "C": 0.0011588643537834287, + "D": 0.9297729730606079 }, "sample": { "messages": [ @@ -95992,10 +95992,10 @@ ] }, "predict": { - "A": 0.9666689038276672, - "B": 0.001866110018454492, - "C": 0.00019668655295390636, - "D": 0.0002525505260564387 + "A": 0.9236844778060913, + "B": 0.00014636827108915895, + "C": 4.4639804400503635e-05, + "D": 2.7075408070231788e-05 }, "sample": { "messages": [ @@ -96037,10 +96037,10 @@ ] }, "predict": { - "A": 0.017717229202389717, - "B": 0.0021160212345421314, - "C": 0.0001532840688014403, - "D": 0.9673279523849487 + "A": 0.011568084359169006, + "B": 0.0007395229185931385, + "C": 0.00012850981147494167, + "D": 0.9189667701721191 }, "sample": { "messages": [ @@ -96082,10 +96082,10 @@ ] }, "predict": { - "A": 0.0004846524097956717, - "B": 0.00022893356799613684, - "C": 0.9929463267326355, - "D": 0.00015734358748886734 + "A": 0.00024307260173372924, + "B": 4.2239676986355335e-05, + "C": 0.9903959631919861, + "D": 2.9030879886704497e-05 }, "sample": { "messages": [ @@ -96120,17 +96120,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "A" + "C" ] }, "predict": { - "A": 0.4330191910266876, - "B": 0.337235689163208, - "C": 0.18050925433635712, - "D": 0.016789976507425308 + "A": 0.28300413489341736, + "B": 0.08108203858137131, + "C": 0.5991197228431702, + "D": 0.0029534129425883293 }, "sample": { "messages": [ @@ -96160,7 +96160,7 @@ "prompt_len": 92, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -96172,10 +96172,10 @@ ] }, "predict": { - "A": 0.00048150643124245107, - "B": 0.9865009188652039, - "C": 1.4540232768922579e-05, - "D": 0.00012174388393759727 + "A": 0.0008481965051032603, + "B": 0.930160403251648, + "C": 2.2603670004173182e-05, + "D": 0.00015690058353357017 }, "sample": { "messages": [ @@ -96217,10 +96217,10 @@ ] }, "predict": { - "A": 0.08783292025327682, - "B": 0.23875464498996735, - "C": 0.6490023732185364, - "D": 0.006362595595419407 + "A": 0.0400928370654583, + "B": 0.027555376291275024, + "C": 0.9125087261199951, + "D": 0.0007343259057961404 }, "sample": { "messages": [ @@ -96262,10 +96262,10 @@ ] }, "predict": { - "A": 0.0011687185615301132, - "B": 0.09284280985593796, - "C": 0.002474177395924926, - "D": 0.8808680772781372 + "A": 0.00043447106145322323, + "B": 0.01735488511621952, + "C": 0.0008116987883113325, + "D": 0.9475446343421936 }, "sample": { "messages": [ @@ -96307,10 +96307,10 @@ ] }, "predict": { - "A": 0.9403039813041687, - "B": 0.0018152137054130435, - "C": 0.0008574462844990194, - "D": 0.00016884117212612182 + "A": 0.8656582832336426, + "B": 0.00022616061323788017, + "C": 0.0003502844483591616, + "D": 6.479611329268664e-05 }, "sample": { "messages": [ @@ -96352,10 +96352,10 @@ ] }, "predict": { - "A": 0.0008897993830032647, - "B": 0.9757835268974304, - "C": 0.0002549317723605782, - "D": 0.009566245600581169 + "A": 0.0008298321045003831, + "B": 0.9100213646888733, + "C": 0.0003052781685255468, + "D": 0.04530729353427887 }, "sample": { "messages": [ @@ -96397,10 +96397,10 @@ ] }, "predict": { - "A": 0.057820241898298264, - "B": 0.010047652758657932, - "C": 0.002241934183984995, - "D": 0.9044607877731323 + "A": 0.040493693202733994, + "B": 0.002016062382608652, + "C": 0.0007416678708977997, + "D": 0.9216322302818298 }, "sample": { "messages": [ @@ -96442,10 +96442,10 @@ ] }, "predict": { - "A": 0.9524675011634827, - "B": 0.00824045855551958, - "C": 0.00028197301435284317, - "D": 0.0008685379289090633 + "A": 0.8915980458259583, + "B": 0.0011829547584056854, + "C": 8.569296187488362e-05, + "D": 0.0002188246580772102 }, "sample": { "messages": [ @@ -96487,10 +96487,10 @@ ] }, "predict": { - "A": 0.2310730218887329, - "B": 0.6281214952468872, - "C": 0.10915113985538483, - "D": 0.00290877977386117 + "A": 0.2500893175601959, + "B": 0.5294390320777893, + "C": 0.15168683230876923, + "D": 0.001685088500380516 }, "sample": { "messages": [ @@ -96532,10 +96532,10 @@ ] }, "predict": { - "A": 0.00022970463032834232, - "B": 9.575514559401199e-05, - "C": 0.9962906837463379, - "D": 0.00012295202759560198 + "A": 9.514645353192464e-05, + "B": 3.500242382870056e-05, + "C": 0.9899575114250183, + "D": 4.784264456247911e-05 }, "sample": { "messages": [ @@ -96577,10 +96577,10 @@ ] }, "predict": { - "A": 0.9695615768432617, - "B": 0.0012863953597843647, - "C": 0.0002870336174964905, - "D": 0.002403303049504757 + "A": 0.8974296450614929, + "B": 0.0003411390061955899, + "C": 0.00030105409678071737, + "D": 0.006851959507912397 }, "sample": { "messages": [ @@ -96622,10 +96622,10 @@ ] }, "predict": { - "A": 0.9766522645950317, - "B": 0.002136418130248785, - "C": 1.8483638996258378e-05, - "D": 2.689353823370766e-05 + "A": 0.9143654704093933, + "B": 0.00034757680259644985, + "C": 1.0495909009478055e-05, + "D": 8.174222784873564e-06 }, "sample": { "messages": [ @@ -96660,17 +96660,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "D" ] }, "predict": { - "A": 0.7269444465637207, - "B": 0.006289301905781031, - "C": 0.0018019151175394654, - "D": 0.2360043078660965 + "A": 0.3270598351955414, + "B": 0.002203711774200201, + "C": 0.0008629859075881541, + "D": 0.6110281944274902 }, "sample": { "messages": [ @@ -96700,7 +96700,7 @@ "prompt_len": 75, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -96712,10 +96712,10 @@ ] }, "predict": { - "A": 0.21620002388954163, - "B": 0.010763964615762234, - "C": 0.003959841560572386, - "D": 0.7546122074127197 + "A": 0.17936478555202484, + "B": 0.002121072029694915, + "C": 0.0008306236704811454, + "D": 0.803857147693634 }, "sample": { "messages": [ @@ -96757,10 +96757,10 @@ ] }, "predict": { - "A": 0.8987486362457275, - "B": 0.04474605992436409, - "C": 0.011313576251268387, - "D": 0.0004970838199369609 + "A": 0.8854592442512512, + "B": 0.004100489895790815, + "C": 0.0036186696961522102, + "D": 0.00023133378999773413 }, "sample": { "messages": [ @@ -96802,10 +96802,10 @@ ] }, "predict": { - "A": 0.0009010927169583738, - "B": 0.0006193113513290882, - "C": 0.0004256460815668106, - "D": 0.9881681799888611 + "A": 0.00042033640784211457, + "B": 9.983864583773538e-05, + "C": 0.00022498986800201237, + "D": 0.9758414030075073 }, "sample": { "messages": [ @@ -96847,10 +96847,10 @@ ] }, "predict": { - "A": 0.4534696936607361, - "B": 0.002696429379284382, - "C": 0.002696429379284382, - "D": 0.5138484835624695 + "A": 0.23494406044483185, + "B": 0.0005470842006616294, + "C": 0.0010220878757536411, + "D": 0.7236786484718323 }, "sample": { "messages": [ @@ -96885,17 +96885,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.25313472747802734, - "B": 0.6880914568901062, - "C": 0.003186495741829276, - "D": 0.002812072401866317 + "A": 0.5810291767120361, + "B": 0.2137487232685089, + "C": 0.003677749540656805, + "D": 0.002095518633723259 }, "sample": { "messages": [ @@ -96925,7 +96925,7 @@ "prompt_len": 65, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -96937,10 +96937,10 @@ ] }, "predict": { - "A": 0.03217786177992821, - "B": 0.9403749108314514, - "C": 0.0001913365558721125, - "D": 0.00010902028589043766 + "A": 0.021891359239816666, + "B": 0.930844247341156, + "C": 8.404457184951752e-05, + "D": 0.0001222841237904504 }, "sample": { "messages": [ @@ -96982,10 +96982,10 @@ ] }, "predict": { - "A": 0.02517315372824669, - "B": 0.0118909552693367, - "C": 0.8336203098297119, - "D": 0.11281824111938477 + "A": 0.01938365027308464, + "B": 0.0038168674800544977, + "C": 0.9339566230773926, + "D": 0.024889100342988968 }, "sample": { "messages": [ @@ -97027,10 +97027,10 @@ ] }, "predict": { - "A": 0.010677922517061234, - "B": 0.9611958265304565, - "C": 0.0004691551439464092, - "D": 0.015536284074187279 + "A": 0.008331244811415672, + "B": 0.962960958480835, + "C": 0.0007279760320670903, + "D": 0.007352297194302082 }, "sample": { "messages": [ @@ -97072,10 +97072,10 @@ ] }, "predict": { - "A": 0.01712404564023018, - "B": 0.02823277749121189, - "C": 0.9349411725997925, - "D": 0.009165841154754162 + "A": 0.012160759419202805, + "B": 0.007375872693955898, + "C": 0.9660487771034241, + "D": 0.0034841159358620644 }, "sample": { "messages": [ @@ -97117,10 +97117,10 @@ ] }, "predict": { - "A": 0.009431486018002033, - "B": 0.9620376825332642, - "C": 0.0008772648288868368, - "D": 0.002104449085891247 + "A": 0.006662277039140463, + "B": 0.872586190700531, + "C": 0.0005468730232678354, + "D": 0.0014865552075207233 }, "sample": { "messages": [ @@ -97162,10 +97162,10 @@ ] }, "predict": { - "A": 0.012123208492994308, - "B": 0.013737394474446774, - "C": 0.9630657434463501, - "D": 0.0011276340810582042 + "A": 0.005795566830784082, + "B": 0.002132070017978549, + "C": 0.9746645092964172, + "D": 0.0001750109513523057 }, "sample": { "messages": [ @@ -97207,10 +97207,10 @@ ] }, "predict": { - "A": 0.009754152037203312, - "B": 0.8780407309532166, - "C": 0.002794611267745495, - "D": 0.08167050778865814 + "A": 0.0198908019810915, + "B": 0.7463968396186829, + "C": 0.0034565033856779337, + "D": 0.18871867656707764 }, "sample": { "messages": [ @@ -97252,10 +97252,10 @@ ] }, "predict": { - "A": 0.01738722436130047, - "B": 0.015344171784818172, - "C": 0.002076607896015048, - "D": 0.949310302734375 + "A": 0.004485775716602802, + "B": 0.0024010625202208757, + "C": 0.000570302305277437, + "D": 0.9686577320098877 }, "sample": { "messages": [ @@ -97297,10 +97297,10 @@ ] }, "predict": { - "A": 0.004535394720733166, - "B": 0.008473231457173824, - "C": 0.9793724417686462, - "D": 0.0006955252029001713 + "A": 0.004559693392366171, + "B": 0.0011528709437698126, + "C": 0.9846195578575134, + "D": 0.0001376908039674163 }, "sample": { "messages": [ @@ -97342,10 +97342,10 @@ ] }, "predict": { - "A": 0.00010644479334587231, - "B": 0.9773757457733154, - "C": 9.900907571136486e-06, - "D": 1.9690307453856803e-05 + "A": 8.848968718666583e-05, + "B": 0.9206967353820801, + "C": 8.761665412748698e-06, + "D": 2.1018142433604226e-05 }, "sample": { "messages": [ @@ -97387,10 +97387,10 @@ ] }, "predict": { - "A": 0.012127314694225788, - "B": 0.012127314694225788, - "C": 0.963391900062561, - "D": 0.0004149738815613091 + "A": 0.005797699559479952, + "B": 0.009558791294693947, + "C": 0.9750231504440308, + "D": 0.00021118119184393436 }, "sample": { "messages": [ @@ -97432,10 +97432,10 @@ ] }, "predict": { - "A": 0.9462804794311523, - "B": 0.0034128203988075256, - "C": 0.010512227192521095, - "D": 0.000523372960742563 + "A": 0.9375969767570496, + "B": 0.0003348141035530716, + "C": 0.0015973089030012488, + "D": 0.00011570865899557248 }, "sample": { "messages": [ @@ -97477,10 +97477,10 @@ ] }, "predict": { - "A": 0.5109453201293945, - "B": 0.1291871964931488, - "C": 0.2734893262386322, - "D": 0.025438467040657997 + "A": 0.6678143739700317, + "B": 0.020166246220469475, + "C": 0.21680757403373718, + "D": 0.004499698057770729 }, "sample": { "messages": [ @@ -97522,10 +97522,10 @@ ] }, "predict": { - "A": 0.17392145097255707, - "B": 0.7794618606567383, - "C": 0.0014135491801425815, - "D": 0.004090240225195885 + "A": 0.12557898461818695, + "B": 0.8188778162002563, + "C": 0.000482118601212278, + "D": 0.0016827592626214027 }, "sample": { "messages": [ @@ -97567,10 +97567,10 @@ ] }, "predict": { - "A": 0.028519146144390106, - "B": 0.010491607710719109, - "C": 0.0017127078026533127, - "D": 0.9444243907928467 + "A": 0.008322618901729584, + "B": 0.0010581021197140217, + "C": 0.0004695300303865224, + "D": 0.9619638919830322 }, "sample": { "messages": [ @@ -97612,10 +97612,10 @@ ] }, "predict": { - "A": 0.0014677448198199272, - "B": 0.00022508595429826528, - "C": 3.4518052416387945e-05, - "D": 0.9762582182884216 + "A": 0.0005150812794454396, + "B": 6.970867252675816e-05, + "C": 2.2631091269431636e-05, + "D": 0.9312887787818909 }, "sample": { "messages": [ @@ -97657,10 +97657,10 @@ ] }, "predict": { - "A": 0.11006128042936325, - "B": 0.004835755098611116, - "C": 0.04588036984205246, - "D": 0.8132489919662476 + "A": 0.056366972625255585, + "B": 0.0015021291328594089, + "C": 0.016149409115314484, + "D": 0.8817278742790222 }, "sample": { "messages": [ @@ -97702,10 +97702,10 @@ ] }, "predict": { - "A": 4.522896051639691e-05, - "B": 1.379405330226291e-05, - "C": 0.996234118938446, - "D": 0.0001229450572282076 + "A": 5.380891525419429e-05, + "B": 5.00500891575939e-06, + "C": 0.9825820922851562, + "D": 4.1906423575710505e-05 }, "sample": { "messages": [ @@ -97747,10 +97747,10 @@ ] }, "predict": { - "A": 0.001911662518978119, - "B": 0.001911662518978119, - "C": 0.9902656674385071, - "D": 0.0002014877536566928 + "A": 0.000799422210548073, + "B": 0.00035474143805913627, + "C": 0.9934005737304688, + "D": 6.164481601445004e-05 }, "sample": { "messages": [ @@ -97792,10 +97792,10 @@ ] }, "predict": { - "A": 0.0006134977447800338, - "B": 0.9788920879364014, - "C": 0.0003283817204646766, - "D": 0.0004777926078531891 + "A": 0.0003008301428053528, + "B": 0.8967620134353638, + "C": 0.0002200920571340248, + "D": 0.0007216540398076177 }, "sample": { "messages": [ @@ -97837,10 +97837,10 @@ ] }, "predict": { - "A": 0.9468303322792053, - "B": 0.0034148034173995256, - "C": 0.002346957800909877, - "D": 0.004384694620966911 + "A": 0.9120612740516663, + "B": 0.0008853317121975124, + "C": 0.001136788516305387, + "D": 0.0006084790220484138 }, "sample": { "messages": [ @@ -97882,10 +97882,10 @@ ] }, "predict": { - "A": 0.0014763720100745559, - "B": 0.006616640370339155, - "C": 0.9819965362548828, - "D": 0.00019980523211415857 + "A": 0.001017118222080171, + "B": 0.001017118222080171, + "C": 0.9843420386314392, + "D": 6.108269735705107e-05 }, "sample": { "messages": [ @@ -97927,10 +97927,10 @@ ] }, "predict": { - "A": 0.2298002690076828, - "B": 0.2027980089187622, - "C": 0.05127536877989769, - "D": 0.48648715019226074 + "A": 0.09563866257667542, + "B": 0.021339869126677513, + "C": 0.03986812382936478, + "D": 0.8007726669311523 }, "sample": { "messages": [ @@ -97972,10 +97972,10 @@ ] }, "predict": { - "A": 0.7679321765899658, - "B": 0.008530956692993641, - "C": 0.0035562319681048393, - "D": 0.19416366517543793 + "A": 0.6224796175956726, + "B": 0.001642485847696662, + "C": 0.0007758554420433939, + "D": 0.33318933844566345 }, "sample": { "messages": [ @@ -98017,10 +98017,10 @@ ] }, "predict": { - "A": 0.017679426819086075, - "B": 0.9652639627456665, - "C": 0.0016444429056718946, - "D": 0.0016444429056718946 + "A": 0.021894605830311775, + "B": 0.9309823513031006, + "C": 0.0014899467350915074, + "D": 0.0014899467350915074 }, "sample": { "messages": [ @@ -98062,10 +98062,10 @@ ] }, "predict": { - "A": 0.9536768198013306, - "B": 0.005670769605785608, - "C": 0.00126531976275146, - "D": 0.0004654851509258151 + "A": 0.922839879989624, + "B": 0.0006156699382700026, + "C": 0.00024109979858621955, + "D": 0.00015566573711112142 }, "sample": { "messages": [ @@ -98107,10 +98107,10 @@ ] }, "predict": { - "A": 0.0009013119852170348, - "B": 0.0010213202331215143, - "C": 6.13350493949838e-05, - "D": 0.988408625125885 + "A": 0.0005760803469456732, + "B": 0.0002255962899653241, + "C": 1.6342140952474438e-05, + "D": 0.9784716963768005 }, "sample": { "messages": [ @@ -98152,10 +98152,10 @@ ] }, "predict": { - "A": 0.03882777318358421, - "B": 0.07253982871770859, - "C": 0.09314299374818802, - "D": 0.779876708984375 + "A": 0.013796416111290455, + "B": 0.0039527397602796555, + "C": 0.0016477471217513084, + "D": 0.9672034978866577 }, "sample": { "messages": [ @@ -98190,17 +98190,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "D" ] }, "predict": { - "A": 0.8283833265304565, - "B": 0.004925746936351061, - "C": 0.0006262385286390781, - "D": 0.1270367056131363 + "A": 0.28444918990135193, + "B": 0.0010920478962361813, + "C": 0.00045523326843976974, + "D": 0.6823581457138062 }, "sample": { "messages": [ @@ -98230,7 +98230,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -98242,10 +98242,10 @@ ] }, "predict": { - "A": 0.7252795100212097, - "B": 0.02812213823199272, - "C": 0.20779606699943542, - "D": 0.0020371610298752785 + "A": 0.4453596770763397, + "B": 0.005606251303106546, + "C": 0.4453596770763397, + "D": 0.000629003974609077 }, "sample": { "messages": [ @@ -98275,7 +98275,7 @@ "prompt_len": 68, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -98287,10 +98287,10 @@ ] }, "predict": { - "A": 0.0003328360617160797, - "B": 0.00035430214484222233, - "C": 0.9921703934669495, - "D": 7.905549136921763e-05 + "A": 0.00013012645649723709, + "B": 8.943452121457085e-05, + "C": 0.9905411601066589, + "D": 2.727598803176079e-05 }, "sample": { "messages": [ @@ -98332,10 +98332,10 @@ ] }, "predict": { - "A": 0.0019121939549222589, - "B": 0.0027822258416563272, - "C": 0.9905409216880798, - "D": 0.00013012642739340663 + "A": 0.00031355631654150784, + "B": 0.00037822109879925847, + "C": 0.9949809908866882, + "D": 5.4487911256728694e-05 }, "sample": { "messages": [ @@ -98377,10 +98377,10 @@ ] }, "predict": { - "A": 0.00010795124399010092, - "B": 0.0002934419026132673, - "C": 0.9912079572677612, - "D": 3.092854967690073e-05 + "A": 1.7542022760608234e-05, + "B": 5.0759525038301945e-05, + "C": 0.9866782426834106, + "D": 9.389567821926903e-06 }, "sample": { "messages": [ @@ -98422,10 +98422,10 @@ ] }, "predict": { - "A": 0.01888459362089634, - "B": 0.9099107384681702, - "C": 0.02139904908835888, - "D": 0.027476923540234566 + "A": 0.03788752853870392, + "B": 0.8623161315917969, + "C": 0.015793871134519577, + "D": 0.01393804233521223 }, "sample": { "messages": [ @@ -98467,10 +98467,10 @@ ] }, "predict": { - "A": 0.013940726406872272, - "B": 0.002745092147961259, - "C": 0.9773204326629639, - "D": 0.0003715078055392951 + "A": 0.025639090687036514, + "B": 0.0006418621051125228, + "C": 0.9620998501777649, + "D": 0.00028482460766099393 }, "sample": { "messages": [ @@ -98512,10 +98512,10 @@ ] }, "predict": { - "A": 0.15981480479240417, - "B": 0.003316850634291768, - "C": 0.8116065859794617, - "D": 0.0017753822030499578 + "A": 0.12629766762256622, + "B": 0.0002595363766886294, + "C": 0.8235642313957214, + "D": 0.00020212715025991201 }, "sample": { "messages": [ @@ -98557,10 +98557,10 @@ ] }, "predict": { - "A": 0.0731315165758133, - "B": 0.014400449581444263, - "C": 0.0032131746411323547, - "D": 0.8909242749214172 + "A": 0.04587973281741142, + "B": 0.0031221553217619658, + "C": 0.0017789504490792751, + "D": 0.9215191602706909 }, "sample": { "messages": [ @@ -98602,10 +98602,10 @@ ] }, "predict": { - "A": 0.013600999489426613, - "B": 0.0064246575348079205, - "C": 0.0005975861568003893, - "D": 0.9535036683082581 + "A": 0.010540026240050793, + "B": 0.0017206119373440742, + "C": 0.00031828120700083673, + "D": 0.9487829208374023 }, "sample": { "messages": [ @@ -98647,10 +98647,10 @@ ] }, "predict": { - "A": 0.010479503311216831, - "B": 0.03227913752198219, - "C": 0.9433347582817078, - "D": 0.00014042541442904621 + "A": 0.0031004496850073338, + "B": 0.004511128179728985, + "C": 0.9741322994232178, + "D": 7.29155945009552e-05 }, "sample": { "messages": [ @@ -98692,10 +98692,10 @@ ] }, "predict": { - "A": 0.002162898425012827, - "B": 0.0011577160330489278, - "C": 0.9887575507164001, - "D": 0.00029271646053530276 + "A": 0.002019303385168314, + "B": 0.00045056751696392894, + "C": 0.9826496839523315, + "D": 8.872200851328671e-05 }, "sample": { "messages": [ @@ -98737,10 +98737,10 @@ ] }, "predict": { - "A": 0.8015095591545105, - "B": 0.02742614969611168, - "C": 0.0017532950732856989, - "D": 0.1392814666032791 + "A": 0.7777736186981201, + "B": 0.0046248105354607105, + "C": 0.0011693353299051523, + "D": 0.1351567804813385 }, "sample": { "messages": [ @@ -98782,10 +98782,10 @@ ] }, "predict": { - "A": 0.013601765036582947, - "B": 0.005670059006661177, - "C": 0.001116500818170607, - "D": 0.9535573720932007 + "A": 0.02462758496403694, + "B": 0.0016759282443672419, + "C": 0.0005791852599941194, + "D": 0.9241433143615723 }, "sample": { "messages": [ @@ -98827,10 +98827,10 @@ ] }, "predict": { - "A": 0.2306506633758545, - "B": 0.12345841526985168, - "C": 0.06608252227306366, - "D": 0.5533022284507751 + "A": 0.057615961879491806, + "B": 0.006881244946271181, + "C": 0.008835694752633572, + "D": 0.9012653231620789 }, "sample": { "messages": [ @@ -98872,10 +98872,10 @@ ] }, "predict": { - "A": 0.005160814616829157, - "B": 0.9834771752357483, - "C": 6.49650683044456e-05, - "D": 0.0006163713987916708 + "A": 0.0010940321953967214, + "B": 0.9343678951263428, + "C": 1.8823866412276402e-05, + "D": 0.00040247198194265366 }, "sample": { "messages": [ @@ -98917,10 +98917,10 @@ ] }, "predict": { - "A": 0.11544471234083176, - "B": 0.752794086933136, - "C": 0.025759195908904076, - "D": 0.07002075761556625 + "A": 0.26896196603775024, + "B": 0.5693924427032471, + "C": 0.03212292492389679, + "D": 0.06800423562526703 }, "sample": { "messages": [ @@ -98962,10 +98962,10 @@ ] }, "predict": { - "A": 0.409294456243515, - "B": 0.02309081330895424, - "C": 0.5255444645881653, - "D": 0.0035410907585173845 + "A": 0.04664148390293121, + "B": 0.0013231171760708094, + "C": 0.936819314956665, + "D": 0.00042955324170179665 }, "sample": { "messages": [ @@ -99007,10 +99007,10 @@ ] }, "predict": { - "A": 0.20001736283302307, - "B": 0.32977285981178284, - "C": 0.4234367609024048, - "D": 0.0127866817638278 + "A": 0.3016771376132965, + "B": 0.12575773894786835, + "C": 0.4973815381526947, + "D": 0.007094766013324261 }, "sample": { "messages": [ @@ -99052,10 +99052,10 @@ ] }, "predict": { - "A": 0.004051122348755598, - "B": 0.0003325363795738667, - "C": 0.9912769794464111, - "D": 0.00020169350318610668 + "A": 0.0016893799183890224, + "B": 4.502044612308964e-05, + "C": 0.9916412830352783, + "D": 6.153564754640684e-05 }, "sample": { "messages": [ @@ -99097,10 +99097,10 @@ ] }, "predict": { - "A": 0.9784639477729797, - "B": 0.0024253695737570524, - "C": 0.00042146604391746223, - "D": 0.00010656330414349213 + "A": 0.9133157730102539, + "B": 0.0010693827643990517, + "C": 0.00034717778908088803, + "D": 3.437525447225198e-05 }, "sample": { "messages": [ @@ -99142,10 +99142,10 @@ ] }, "predict": { - "A": 0.9781829118728638, - "B": 0.003997609950602055, - "C": 0.00015500414883717895, - "D": 0.0003718356601893902 + "A": 0.9139316082000732, + "B": 0.000649050809442997, + "C": 3.2314339478034526e-05, + "D": 7.282147998921573e-05 }, "sample": { "messages": [ @@ -99187,10 +99187,10 @@ ] }, "predict": { - "A": 0.0011564857559278607, - "B": 0.0007948402781039476, - "C": 0.9877068400382996, - "D": 0.0027742653619498014 + "A": 0.00022851224639452994, + "B": 4.499673377722502e-05, + "C": 0.9911189675331116, + "D": 0.0002146673941751942 }, "sample": { "messages": [ @@ -99232,10 +99232,10 @@ ] }, "predict": { - "A": 0.019615385681390762, - "B": 0.945122241973877, - "C": 0.00031705317087471485, - "D": 0.0038624985609203577 + "A": 0.01797914318740368, + "B": 0.8662835955619812, + "C": 0.0004791279206983745, + "D": 0.003768633119761944 }, "sample": { "messages": [ @@ -99277,10 +99277,10 @@ ] }, "predict": { - "A": 0.05942278727889061, - "B": 0.9295288324356079, - "C": 0.0007480225176550448, - "D": 0.0004536986234597862 + "A": 0.279925674200058, + "B": 0.6715068221092224, + "C": 0.0009484029142186046, + "D": 0.0005076435045339167 }, "sample": { "messages": [ @@ -99315,17 +99315,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.029821852222085, - "B": 0.5286049246788025, - "C": 0.4116779565811157, - "D": 0.008544103242456913 + "A": 0.020094525068998337, + "B": 0.1906515508890152, + "C": 0.7540415525436401, + "D": 0.005757177714258432 }, "sample": { "messages": [ @@ -99355,7 +99355,7 @@ "prompt_len": 69, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -99367,10 +99367,10 @@ ] }, "predict": { - "A": 0.00049551174743101, - "B": 0.6157467365264893, - "C": 0.0006362496642395854, - "D": 0.3734692931175232 + "A": 0.0004867402894888073, + "B": 0.7295846343040466, + "C": 0.0009679985814727843, + "D": 0.236861452460289 }, "sample": { "messages": [ @@ -99412,10 +99412,10 @@ ] }, "predict": { - "A": 0.051620058715343475, - "B": 0.018989956006407738, - "C": 0.001071340055204928, - "D": 0.9149873852729797 + "A": 0.11329600214958191, + "B": 0.01194131001830101, + "C": 0.0014261862961575389, + "D": 0.837150514125824 }, "sample": { "messages": [ @@ -99457,10 +99457,10 @@ ] }, "predict": { - "A": 0.059142015874385834, - "B": 0.9251368641853333, - "C": 0.0022931850980967283, - "D": 0.00045155492261983454 + "A": 0.0812954381108284, + "B": 0.8740083575248718, + "C": 0.0010893596336245537, + "D": 0.00021450762869790196 }, "sample": { "messages": [ @@ -99502,10 +99502,10 @@ ] }, "predict": { - "A": 0.072098508477211, - "B": 0.7751320004463196, - "C": 0.05615037679672241, - "D": 0.0031677873339504004 + "A": 0.06123717874288559, + "B": 0.746021568775177, + "C": 0.04769156128168106, + "D": 0.00416724244132638 }, "sample": { "messages": [ @@ -99547,10 +99547,10 @@ ] }, "predict": { - "A": 0.040766239166259766, - "B": 0.0026060985401272774, - "C": 0.005517110228538513, - "D": 0.9278352856636047 + "A": 0.07158860564231873, + "B": 0.0007952775922603905, + "C": 0.005876350682228804, + "D": 0.8721277713775635 }, "sample": { "messages": [ @@ -99592,10 +99592,10 @@ ] }, "predict": { - "A": 0.9479932188987732, - "B": 0.004974612034857273, - "C": 0.004390079993754625, - "D": 0.015322882682085037 + "A": 0.8974705934524536, + "B": 0.000320485036354512, + "C": 0.006047109141945839, + "D": 0.0010508300038054585 }, "sample": { "messages": [ @@ -99637,10 +99637,10 @@ ] }, "predict": { - "A": 0.008467350155115128, - "B": 0.0051357075572013855, - "C": 0.9786925911903381, - "D": 0.00054129958152771 + "A": 0.0019042944768443704, + "B": 0.0008995250100269914, + "C": 0.9864489436149597, + "D": 9.480924200033769e-05 }, "sample": { "messages": [ @@ -99682,10 +99682,10 @@ ] }, "predict": { - "A": 0.47031447291374207, - "B": 0.47031447291374207, - "C": 0.026533327996730804, - "D": 0.0040690177120268345 + "A": 0.8548248410224915, + "B": 0.02278031036257744, + "C": 0.01774132251739502, + "D": 0.0006462276796810329 }, "sample": { "messages": [ @@ -99727,10 +99727,10 @@ ] }, "predict": { - "A": 0.9665723443031311, - "B": 0.006512713152915239, - "C": 0.0011317398166283965, - "D": 0.0012824293226003647 + "A": 0.951306164264679, + "B": 0.0003849414351861924, + "C": 0.00040976802119985223, + "D": 0.000248536845901981 }, "sample": { "messages": [ @@ -99772,10 +99772,10 @@ ] }, "predict": { - "A": 0.9626694321632385, - "B": 0.001277250936254859, - "C": 0.0014473148621618748, - "D": 0.0005324374069459736 + "A": 0.9316577315330505, + "B": 0.0001672886428423226, + "C": 0.00031253634369932115, + "D": 6.551117985509336e-05 }, "sample": { "messages": [ @@ -99817,10 +99817,10 @@ ] }, "predict": { - "A": 0.0008740659104660153, - "B": 0.9585297107696533, - "C": 0.00019503047224134207, - "D": 0.0002837676729541272 + "A": 0.0008103930740617216, + "B": 0.8887039422988892, + "C": 0.0001242778089363128, + "D": 0.00014990764611866325 }, "sample": { "messages": [ @@ -99862,10 +99862,10 @@ ] }, "predict": { - "A": 0.003890705294907093, - "B": 0.9520242214202881, - "C": 0.02238946408033371, - "D": 0.0012631270801648498 + "A": 0.005987134296447039, + "B": 0.8885695338249207, + "C": 0.030405176803469658, + "D": 0.0009773727506399155 }, "sample": { "messages": [ @@ -99907,10 +99907,10 @@ ] }, "predict": { - "A": 0.0653909370303154, - "B": 0.014590688981115818, - "C": 0.9026939868927002, - "D": 0.0004690169298555702 + "A": 0.11457489430904388, + "B": 0.005704347975552082, + "C": 0.846600353717804, + "D": 0.00020778088946826756 }, "sample": { "messages": [ @@ -99952,10 +99952,10 @@ ] }, "predict": { - "A": 0.005102140363305807, - "B": 0.003506646491587162, - "C": 0.0027309791184961796, - "D": 0.972295880317688 + "A": 0.0034791671205312014, + "B": 0.00041552726179361343, + "C": 0.0012023680610582232, + "D": 0.9646766185760498 }, "sample": { "messages": [ @@ -99997,10 +99997,10 @@ ] }, "predict": { - "A": 0.0006992669077590108, - "B": 0.9846411347389221, - "C": 0.00022701872512698174, - "D": 0.0005445896531455219 + "A": 0.00032435983303003013, + "B": 0.9083213806152344, + "C": 0.00014393380843102932, + "D": 0.000391252659028396 }, "sample": { "messages": [ @@ -100042,10 +100042,10 @@ ] }, "predict": { - "A": 0.02410757727921009, - "B": 0.004747063387185335, - "C": 0.1387295126914978, - "D": 0.7983333468437195 + "A": 0.008473703637719154, + "B": 0.0008390101720578969, + "C": 0.08039625734090805, + "D": 0.8643412590026855 }, "sample": { "messages": [ @@ -100080,17 +100080,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "D" + "A" ] }, "predict": { - "A": 0.3133193254470825, - "B": 0.04804907366633415, - "C": 0.020029833540320396, - "D": 0.5853575468063354 + "A": 0.4378132224082947, + "B": 0.013220814056694508, + "C": 0.019236169755458832, + "D": 0.4378132224082947 }, "sample": { "messages": [ @@ -100120,7 +100120,7 @@ "prompt_len": 94, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " D" + "generated_token": " A" } } { @@ -100128,14 +100128,14 @@ "acc": false, "f1_macro": [ "A", - "B" + "C" ] }, "predict": { - "A": 0.04242346063256264, - "B": 0.4025025963783264, - "C": 0.2766357362270355, - "D": 0.21544411778450012 + "A": 0.07620973140001297, + "B": 0.052378132939338684, + "C": 0.7230578064918518, + "D": 0.09785523265600204 }, "sample": { "messages": [ @@ -100165,7 +100165,7 @@ "prompt_len": 80, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -100177,10 +100177,10 @@ ] }, "predict": { - "A": 0.05907273665070534, - "B": 0.027903985232114792, - "C": 0.16057635843753815, - "D": 0.7196532487869263 + "A": 0.009435545653104782, + "B": 0.013728638179600239, + "C": 0.35406622290611267, + "D": 0.5837565064430237 }, "sample": { "messages": [ @@ -100222,10 +100222,10 @@ ] }, "predict": { - "A": 0.9402565360069275, - "B": 0.04131199046969414, - "C": 0.0005892838817089796, - "D": 0.0006677461205981672 + "A": 0.9128771424293518, + "B": 0.0013724552700296044, + "C": 0.0001744881010381505, + "D": 0.00013589147420134395 }, "sample": { "messages": [ @@ -100267,10 +100267,10 @@ ] }, "predict": { - "A": 0.0014640060253441334, - "B": 0.9737713932991028, - "C": 0.0002245125942863524, - "D": 0.00013617378135677427 + "A": 0.0015270309522747993, + "B": 0.8963448405265808, + "C": 0.00019414017151575536, + "D": 0.00015119652380235493 }, "sample": { "messages": [ @@ -100312,10 +100312,10 @@ ] }, "predict": { - "A": 0.07088472694158554, - "B": 0.8635528087615967, - "C": 0.037941861897706985, - "D": 0.00453150924295187 + "A": 0.05050811916589737, + "B": 0.6972430348396301, + "C": 0.13729529082775116, + "D": 0.0023622962180525064 }, "sample": { "messages": [ @@ -100357,10 +100357,10 @@ ] }, "predict": { - "A": 0.3463747799396515, - "B": 0.09923802316188812, - "C": 0.5039722919464111, - "D": 0.025091303512454033 + "A": 0.21636950969696045, + "B": 0.005766051821410656, + "C": 0.7552037835121155, + "D": 0.001208630157634616 }, "sample": { "messages": [ @@ -100402,10 +100402,10 @@ ] }, "predict": { - "A": 0.1670336276292801, - "B": 0.7485927939414978, - "C": 0.00392825435847044, - "D": 0.054227881133556366 + "A": 0.35742515325546265, + "B": 0.5200505256652832, + "C": 0.003504073014482856, + "D": 0.05481291934847832 }, "sample": { "messages": [ @@ -100447,10 +100447,10 @@ ] }, "predict": { - "A": 0.0024238708429038525, - "B": 0.0021390586625784636, - "C": 0.00025547409313730896, - "D": 0.9778593182563782 + "A": 0.0038732809480279684, + "B": 0.0001410842378390953, + "C": 0.0001928393030539155, + "D": 0.9477605819702148 }, "sample": { "messages": [ @@ -100492,10 +100492,10 @@ ] }, "predict": { - "A": 0.0018981907051056623, - "B": 0.9832870960235596, - "C": 0.00012134726421209052, - "D": 0.0004235435917507857 + "A": 0.024976428598165512, + "B": 0.937233567237854, + "C": 0.00023002497619017959, + "D": 0.001699667307548225 }, "sample": { "messages": [ @@ -100537,10 +100537,10 @@ ] }, "predict": { - "A": 0.02336515672504902, - "B": 0.1726464480161667, - "C": 0.7737476825714111, - "D": 0.004600871820002794 + "A": 0.011120290495455265, + "B": 0.0725134015083313, + "C": 0.8833941221237183, + "D": 0.002057046862319112 }, "sample": { "messages": [ @@ -100582,10 +100582,10 @@ ] }, "predict": { - "A": 0.028244785964488983, - "B": 0.9353388547897339, - "C": 0.0055617280304431915, - "D": 0.011774178594350815 + "A": 0.04513688012957573, + "B": 0.9065985083580017, + "C": 0.0012028571218252182, + "D": 0.0022472331766039133 }, "sample": { "messages": [ @@ -100627,10 +100627,10 @@ ] }, "predict": { - "A": 0.9573980569839478, - "B": 0.02251584455370903, - "C": 0.00041239208076149225, - "D": 0.001848213141784072 + "A": 0.9525296092033386, + "B": 0.004411087371408939, + "C": 0.00015093910042196512, + "D": 0.0005608067731373012 }, "sample": { "messages": [ @@ -100672,10 +100672,10 @@ ] }, "predict": { - "A": 0.17113536596298218, - "B": 0.766975462436676, - "C": 0.03369855135679245, - "D": 0.0003985010553151369 + "A": 0.257637083530426, + "B": 0.6180391907691956, + "C": 0.034867387264966965, + "D": 0.0003211175207979977 }, "sample": { "messages": [ @@ -100717,10 +100717,10 @@ ] }, "predict": { - "A": 0.0024353028275072575, - "B": 0.002759559778496623, - "C": 0.9824712872505188, - "D": 0.000543389527592808 + "A": 0.001673807855695486, + "B": 0.0010152156464755535, + "C": 0.9825007319450378, + "D": 0.0006977468729019165 }, "sample": { "messages": [ @@ -100762,10 +100762,10 @@ ] }, "predict": { - "A": 0.08951079845428467, - "B": 0.010690541006624699, - "C": 0.849254846572876, - "D": 0.02905990183353424 + "A": 0.08869501203298569, + "B": 0.00323071563616395, + "C": 0.8415148258209229, + "D": 0.006035770755261183 }, "sample": { "messages": [ @@ -100807,10 +100807,10 @@ ] }, "predict": { - "A": 0.030369840562343597, - "B": 0.1542307734489441, - "C": 0.7832484841346741, - "D": 0.006776427384465933 + "A": 0.023348860442638397, + "B": 0.07191955298185349, + "C": 0.8761595487594604, + "D": 0.0026196695398539305 }, "sample": { "messages": [ @@ -100852,10 +100852,10 @@ ] }, "predict": { - "A": 0.0016623921692371368, - "B": 0.0012946721399202943, - "C": 0.012283507734537125, - "D": 0.9757998585700989 + "A": 5.6513054005336016e-05, + "B": 7.25642021279782e-05, + "C": 0.000884012901224196, + "D": 0.9694378972053528 }, "sample": { "messages": [ @@ -100890,17 +100890,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "D", - "C" + "D" ] }, "predict": { - "A": 0.006097785197198391, - "B": 0.002107338048517704, - "C": 0.48440706729888916, - "D": 0.48440706729888916 + "A": 0.0017165470635518432, + "B": 0.0004620023537427187, + "C": 0.307297021150589, + "D": 0.6505478024482727 }, "sample": { "messages": [ @@ -100930,7 +100930,7 @@ "prompt_len": 100, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " D" } } { @@ -100942,10 +100942,10 @@ ] }, "predict": { - "A": 0.00017503004346508533, - "B": 0.9747708439826965, - "C": 0.00013631355250254273, - "D": 0.00019833503756672144 + "A": 0.0002546875912230462, + "B": 0.8081780076026917, + "C": 0.00012030589277856052, + "D": 0.0003270253073424101 }, "sample": { "messages": [ @@ -100987,10 +100987,10 @@ ] }, "predict": { - "A": 0.15805122256278992, - "B": 0.058143794536590576, - "C": 0.7083364129066467, - "D": 0.005408214870840311 + "A": 0.09139442443847656, + "B": 0.01091550663113594, + "C": 0.7652361392974854, + "D": 0.002435578964650631 }, "sample": { "messages": [ @@ -101032,10 +101032,10 @@ ] }, "predict": { - "A": 0.798958957195282, - "B": 0.07431475818157196, - "C": 0.018789714202284813, - "D": 0.06558254361152649 + "A": 0.8496159315109253, + "B": 0.02565617859363556, + "C": 0.008329342119395733, + "D": 0.04793205112218857 }, "sample": { "messages": [ @@ -101077,10 +101077,10 @@ ] }, "predict": { - "A": 0.9415600299835205, - "B": 0.00923073012381792, - "C": 0.00013166929420549423, - "D": 0.0009729117737151682 + "A": 0.8490430116653442, + "B": 0.002384787192568183, + "C": 0.0001187315647257492, + "D": 0.00046959242899902165 }, "sample": { "messages": [ @@ -101118,14 +101118,14 @@ "acc": false, "f1_macro": [ "C", - "B" + "A" ] }, "predict": { - "A": 0.08480025827884674, - "B": 0.8045624494552612, - "C": 0.00614290963858366, - "D": 0.08480025827884674 + "A": 0.7559928894042969, + "B": 0.11593526601791382, + "C": 0.029313024133443832, + "D": 0.022829007357358932 }, "sample": { "messages": [ @@ -101155,7 +101155,7 @@ "prompt_len": 71, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -101167,10 +101167,10 @@ ] }, "predict": { - "A": 0.003091932274401188, - "B": 0.9714562296867371, - "C": 0.00025380123406648636, - "D": 0.0011374582536518574 + "A": 0.0028663889970630407, + "B": 0.9005926251411438, + "C": 0.00025046226801350713, + "D": 0.0004679251287598163 }, "sample": { "messages": [ @@ -101212,10 +101212,10 @@ ] }, "predict": { - "A": 0.5775996446609497, - "B": 0.017441999167203903, - "C": 0.3503319025039673, - "D": 0.008239015936851501 + "A": 0.772417426109314, + "B": 0.0031566936522722244, + "C": 0.11845405399799347, + "D": 0.002458435483276844 }, "sample": { "messages": [ @@ -101257,10 +101257,10 @@ ] }, "predict": { - "A": 0.9553748965263367, - "B": 0.003904398763552308, - "C": 0.012026394717395306, - "D": 0.0005987589247524738 + "A": 0.8785414695739746, + "B": 0.00026008757413364947, + "C": 0.0016959839267656207, + "D": 6.576043961104006e-05 }, "sample": { "messages": [ @@ -101302,10 +101302,10 @@ ] }, "predict": { - "A": 0.004009198863059282, - "B": 0.9810186624526978, - "C": 0.0008945732261054218, - "D": 0.00047883056686259806 + "A": 0.0012304858537390828, + "B": 0.9274224042892456, + "C": 0.0003111155529040843, + "D": 0.00015643872029613703 }, "sample": { "messages": [ @@ -101340,17 +101340,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "B" + "A" ] }, "predict": { - "A": 0.12864546477794647, - "B": 0.8388738036155701, - "C": 0.006404879968613386, - "D": 0.0044020055793225765 + "A": 0.5030350685119629, + "B": 0.3917641043663025, + "C": 0.0049315826036036015, + "D": 0.0019312354270368814 }, "sample": { "messages": [ @@ -101380,7 +101380,7 @@ "prompt_len": 101, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -101392,10 +101392,10 @@ ] }, "predict": { - "A": 0.8090908527374268, - "B": 0.14059889316558838, - "C": 0.008988188579678535, - "D": 0.003306569531559944 + "A": 0.8715062141418457, + "B": 0.029821300879120827, + "C": 0.0027738132048398256, + "D": 0.0015804710565134883 }, "sample": { "messages": [ @@ -101437,10 +101437,10 @@ ] }, "predict": { - "A": 0.0004222690768074244, - "B": 0.9803282618522644, - "C": 0.0003726511786226183, - "D": 0.0006143978680483997 + "A": 0.00024739941000007093, + "B": 0.8895794749259949, + "C": 0.0005575231625698507, + "D": 0.0005575231625698507 }, "sample": { "messages": [ @@ -101475,17 +101475,17 @@ } { "metric": { - "acc": true, + "acc": false, "f1_macro": [ "A", - "A" + "D" ] }, "predict": { - "A": 0.43082958459854126, - "B": 0.13986988365650177, - "C": 0.05145525932312012, - "D": 0.33553043007850647 + "A": 0.3736635446548462, + "B": 0.01641763001680374, + "C": 0.0186036117374897, + "D": 0.5436772108078003 }, "sample": { "messages": [ @@ -101515,7 +101515,7 @@ "prompt_len": 95, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " D" } } { @@ -101527,10 +101527,10 @@ ] }, "predict": { - "A": 0.006430476903915405, - "B": 0.01980726234614849, - "C": 0.9543673992156982, - "D": 0.004419598262757063 + "A": 0.003100961446762085, + "B": 0.003100961446762085, + "C": 0.9742931127548218, + "D": 0.0009457397391088307 }, "sample": { "messages": [ @@ -101572,10 +101572,10 @@ ] }, "predict": { - "A": 0.0908932164311409, - "B": 0.008454387076199055, - "C": 0.8623708486557007, - "D": 0.010855646803975105 + "A": 0.007382006384432316, + "B": 0.0006059519946575165, + "C": 0.9668521285057068, + "D": 0.0005347507540136576 }, "sample": { "messages": [ @@ -101617,10 +101617,10 @@ ] }, "predict": { - "A": 0.865660548210144, - "B": 0.029621273279190063, - "C": 0.0021457578986883163, - "D": 0.0710577443242073 + "A": 0.938972532749176, + "B": 0.011819920502603054, + "C": 0.0011703306809067726, + "D": 0.013393723405897617 }, "sample": { "messages": [ @@ -101662,10 +101662,10 @@ ] }, "predict": { - "A": 0.0006534856511279941, - "B": 0.0005417586071416736, - "C": 0.002280889078974724, - "D": 0.9795225858688354 + "A": 0.00044392916606739163, + "B": 0.00014412269229069352, + "C": 0.00113361282274127, + "D": 0.9681719541549683 }, "sample": { "messages": [ @@ -101707,10 +101707,10 @@ ] }, "predict": { - "A": 0.013737496919929981, - "B": 0.0009951409883797169, - "C": 0.001859168172813952, - "D": 0.9630729556083679 + "A": 0.00698438286781311, + "B": 0.0002882799308281392, + "C": 0.001071089762263, + "D": 0.9147736430168152 }, "sample": { "messages": [ @@ -101752,10 +101752,10 @@ ] }, "predict": { - "A": 0.9022759795188904, - "B": 0.05090293288230896, - "C": 0.0032541153486818075, - "D": 0.014583933167159557 + "A": 0.9203187823295593, + "B": 0.0025849861558526754, + "C": 0.00022587357671000063, + "D": 0.0008392221061512828 }, "sample": { "messages": [ @@ -101797,10 +101797,10 @@ ] }, "predict": { - "A": 0.05366604030132294, - "B": 0.05366604030132294, - "C": 0.011974512599408627, - "D": 0.8394781351089478 + "A": 0.035429276525974274, + "B": 0.011502202600240707, + "C": 0.00397505471482873, + "D": 0.9137330651283264 }, "sample": { "messages": [ @@ -101842,10 +101842,10 @@ ] }, "predict": { - "A": 0.2367085963487625, - "B": 0.087080217897892, - "C": 0.6434406042098999, - "D": 0.004912729375064373 + "A": 0.13974301517009735, + "B": 0.007883762940764427, + "C": 0.8041656613349915, + "D": 0.0010023079812526703 }, "sample": { "messages": [ @@ -101887,10 +101887,10 @@ ] }, "predict": { - "A": 0.019569119438529015, - "B": 0.9428930282592773, - "C": 0.006353163160383701, - "D": 0.003001021919772029 + "A": 0.048592157661914825, + "B": 0.8613165020942688, + "C": 0.012286020442843437, + "D": 0.0024192610289901495 }, "sample": { "messages": [ @@ -101932,10 +101932,10 @@ ] }, "predict": { - "A": 0.010379094630479813, - "B": 0.046515870839357376, - "C": 0.9342962503433228, - "D": 0.0002293040743097663 + "A": 0.004025997593998909, + "B": 0.005169483367353678, + "C": 0.9851291179656982, + "D": 3.4831704397220165e-05 }, "sample": { "messages": [ @@ -101977,10 +101977,10 @@ ] }, "predict": { - "A": 0.012231605127453804, - "B": 0.007418843451887369, - "C": 0.971676766872406, - "D": 0.0005374192260205746 + "A": 0.032840631902217865, + "B": 0.0014429166913032532, + "C": 0.9597440361976624, + "D": 0.00014286809891927987 }, "sample": { "messages": [ @@ -102022,10 +102022,10 @@ ] }, "predict": { - "A": 0.004529556259512901, - "B": 0.0007871187990531325, - "C": 0.0007871187990531325, - "D": 0.978111743927002 + "A": 0.0005291240522637963, + "B": 0.00013378352741710842, + "C": 0.00019465386867523193, + "D": 0.9566787481307983 }, "sample": { "messages": [ @@ -102067,10 +102067,10 @@ ] }, "predict": { - "A": 0.9598732590675354, - "B": 0.004445095546543598, - "C": 0.0008752911235205829, - "D": 0.0003648756246548146 + "A": 0.9029192328453064, + "B": 0.0006412300863303244, + "C": 0.0005315984017215669, + "D": 0.00011142907897010446 }, "sample": { "messages": [ @@ -102112,10 +102112,10 @@ ] }, "predict": { - "A": 0.17115584015846252, - "B": 0.7670671939849854, - "C": 0.01803969219326973, - "D": 0.015919972211122513 + "A": 0.20988325774669647, + "B": 0.503483772277832, + "C": 0.1442505121231079, + "D": 0.025066979229450226 }, "sample": { "messages": [ @@ -102150,17 +102150,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "C", - "B" + "C" ] }, "predict": { - "A": 0.1492040455341339, - "B": 0.6686861515045166, - "C": 0.13167209923267365, - "D": 0.020192570984363556 + "A": 0.31871309876441956, + "B": 0.19330927729606628, + "C": 0.40923571586608887, + "D": 0.029644938185811043 }, "sample": { "messages": [ @@ -102190,7 +102190,7 @@ "prompt_len": 98, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " C" } } { @@ -102202,10 +102202,10 @@ ] }, "predict": { - "A": 0.013787715695798397, - "B": 0.0034860805608332157, - "C": 0.0003242559905629605, - "D": 0.9665935039520264 + "A": 0.003848377149552107, + "B": 0.0005208211950957775, + "C": 0.00010255577944917604, + "D": 0.9416667819023132 }, "sample": { "messages": [ @@ -102247,10 +102247,10 @@ ] }, "predict": { - "A": 0.07168181985616684, - "B": 0.8732633590698242, - "C": 0.006667447742074728, - "D": 0.015994377434253693 + "A": 0.1646653413772583, + "B": 0.7379788756370544, + "C": 0.004388182424008846, + "D": 0.009289783425629139 }, "sample": { "messages": [ @@ -102292,10 +102292,10 @@ ] }, "predict": { - "A": 0.001122218556702137, - "B": 0.9584406614303589, - "C": 0.0005642869509756565, - "D": 0.0012716402998194098 + "A": 0.0005877881776541471, + "B": 0.8276674151420593, + "C": 0.0004300350265111774, + "D": 0.0010981329251080751 }, "sample": { "messages": [ @@ -102337,10 +102337,10 @@ ] }, "predict": { - "A": 0.0031468947418034077, - "B": 0.0007956595509313047, - "C": 0.9887248873710632, - "D": 8.927052840590477e-05 + "A": 0.0021379042882472277, + "B": 0.0002718040195759386, + "C": 0.9773315787315369, + "D": 3.4556003811303526e-05 }, "sample": { "messages": [ @@ -102382,10 +102382,10 @@ ] }, "predict": { - "A": 0.5363674163818359, - "B": 0.009823912754654884, - "C": 0.012614153325557709, - "D": 0.4177233874797821 + "A": 0.8228075504302979, + "B": 0.00405610166490078, + "C": 0.021927079185843468, + "D": 0.08672327548265457 }, "sample": { "messages": [ @@ -102427,10 +102427,10 @@ ] }, "predict": { - "A": 0.0031453396659344435, - "B": 0.001021142234094441, - "C": 0.9882362484931946, - "D": 0.00022784761677030474 + "A": 0.002765604294836521, + "B": 0.0002738318871706724, + "C": 0.9846232533454895, + "D": 6.923554610693827e-05 }, "sample": { "messages": [ @@ -102472,10 +102472,10 @@ ] }, "predict": { - "A": 0.00022441511100623757, - "B": 0.9733485579490662, - "C": 4.703996819444001e-05, - "D": 4.703996819444001e-05 + "A": 0.00014936838124413043, + "B": 0.8318567872047424, + "C": 4.0201957745011896e-05, + "D": 4.0201957745011896e-05 }, "sample": { "messages": [ @@ -102517,10 +102517,10 @@ ] }, "predict": { - "A": 0.00017879693768918514, - "B": 5.804686952615157e-05, - "C": 0.9957492351531982, - "D": 0.00020260347810108215 + "A": 4.490931678446941e-05, + "B": 1.8720988009590656e-05, + "C": 0.9891934990882874, + "D": 3.963232666137628e-05 }, "sample": { "messages": [ @@ -102562,10 +102562,10 @@ ] }, "predict": { - "A": 0.0016701030544936657, - "B": 0.0007889007683843374, - "C": 8.314951992360875e-05, - "D": 0.9803260564804077 + "A": 0.00051928183529526, + "B": 0.0001158674422185868, + "C": 3.761664993362501e-05, + "D": 0.9388836622238159 }, "sample": { "messages": [ @@ -102607,10 +102607,10 @@ ] }, "predict": { - "A": 0.4498504102230072, - "B": 0.03258705511689186, - "C": 0.07817227393388748, - "D": 0.3969915807247162 + "A": 0.7424833178520203, + "B": 0.003438380314037204, + "C": 0.01978651061654091, + "D": 0.18772919476032257 }, "sample": { "messages": [ @@ -102652,10 +102652,10 @@ ] }, "predict": { - "A": 0.9177068471908569, - "B": 0.013090385124087334, - "C": 0.0025776498951017857, - "D": 0.027712346985936165 + "A": 0.8094201683998108, + "B": 0.008991846814751625, + "C": 0.021570315584540367, + "D": 0.06644125282764435 }, "sample": { "messages": [ @@ -102697,10 +102697,10 @@ ] }, "predict": { - "A": 0.0571138821542263, - "B": 0.6140321493148804, - "C": 0.03925376012921333, - "D": 0.2559666633605957 + "A": 0.0703493133187294, + "B": 0.5198150277137756, + "C": 0.054788097739219666, + "D": 0.2455432415008545 }, "sample": { "messages": [ @@ -102742,10 +102742,10 @@ ] }, "predict": { - "A": 0.017807137221097946, - "B": 0.000782390998210758, - "C": 0.972236692905426, - "D": 0.0006904575857333839 + "A": 0.036409247666597366, + "B": 0.0002453235792927444, + "C": 0.9390069246292114, + "D": 0.0002304602094227448 }, "sample": { "messages": [ @@ -102787,10 +102787,10 @@ ] }, "predict": { - "A": 0.04238585755228996, - "B": 0.08973086625337601, - "C": 0.8513427376747131, - "D": 0.0015439047710970044 + "A": 0.06433680653572083, + "B": 0.030390555039048195, + "C": 0.8881422281265259, + "D": 0.001335267792455852 }, "sample": { "messages": [ @@ -102832,10 +102832,10 @@ ] }, "predict": { - "A": 0.9721274375915527, - "B": 0.0008864654810167849, - "C": 6.42153390799649e-05, - "D": 0.0002539765846449882 + "A": 0.9235928654670715, + "B": 0.00011398041533539072, + "C": 2.5432469556108117e-05, + "D": 0.0001658405235502869 }, "sample": { "messages": [ @@ -102877,10 +102877,10 @@ ] }, "predict": { - "A": 0.9603360891342163, - "B": 0.008308535441756248, - "C": 0.00025089600239880383, - "D": 0.0011244378983974457 + "A": 0.8908822536468506, + "B": 0.001517724827863276, + "C": 0.00014117038517724723, + "D": 0.0005583396414294839 }, "sample": { "messages": [ @@ -102922,10 +102922,10 @@ ] }, "predict": { - "A": 0.23776082694530487, - "B": 0.7323549389839172, - "C": 0.0071797557175159454, - "D": 0.004934568889439106 + "A": 0.06121998280286789, + "B": 0.7458121180534363, + "C": 0.1468590945005417, + "D": 0.002229937817901373 }, "sample": { "messages": [ @@ -102967,10 +102967,10 @@ ] }, "predict": { - "A": 0.035327620804309845, - "B": 0.03117651678621769, - "C": 0.9111114144325256, - "D": 0.008391045033931732 + "A": 0.00746590131893754, + "B": 0.003112250939011574, + "C": 0.9778401255607605, + "D": 0.001297379145398736 }, "sample": { "messages": [ @@ -103012,10 +103012,10 @@ ] }, "predict": { - "A": 0.15593095123767853, - "B": 0.018623298034071922, - "C": 0.0060460991226136684, - "D": 0.7918827533721924 + "A": 0.05643314868211746, + "B": 0.0029908474534749985, + "C": 0.0013271804200485349, + "D": 0.8827629685401917 }, "sample": { "messages": [ @@ -103053,14 +103053,14 @@ "acc": false, "f1_macro": [ "B", - "A" + "C" ] }, "predict": { - "A": 0.2682592272758484, - "B": 0.2682592272758484, - "C": 0.236737921833992, - "D": 0.16270744800567627 + "A": 0.2735125422477722, + "B": 0.11401698738336563, + "C": 0.4509459137916565, + "D": 0.07836264371871948 }, "sample": { "messages": [ @@ -103090,7 +103090,7 @@ "prompt_len": 88, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -103102,10 +103102,10 @@ ] }, "predict": { - "A": 0.0007711646030656993, - "B": 0.9582862854003906, - "C": 0.0016325556207448244, - "D": 0.012063044123351574 + "A": 0.0014146368484944105, + "B": 0.8839255571365356, + "C": 0.001602993463166058, + "D": 0.01260851975530386 }, "sample": { "messages": [ @@ -103147,10 +103147,10 @@ ] }, "predict": { - "A": 0.02849961817264557, - "B": 0.013462265953421593, - "C": 0.9437777400016785, - "D": 0.0013329449575394392 + "A": 0.03633278235793114, + "B": 0.0031747231259942055, + "C": 0.9370347857475281, + "D": 0.00042965204920619726 }, "sample": { "messages": [ @@ -103192,10 +103192,10 @@ ] }, "predict": { - "A": 0.03012056276202202, - "B": 0.0035973885096609592, - "C": 0.880251944065094, - "D": 0.07225547730922699 + "A": 0.025100424885749817, + "B": 0.0007120442460291088, + "C": 0.9418864846229553, + "D": 0.01343528926372528 }, "sample": { "messages": [ @@ -103237,10 +103237,10 @@ ] }, "predict": { - "A": 0.9526640176773071, - "B": 0.010583142749965191, - "C": 0.0014322723727673292, - "D": 0.008242159150540829 + "A": 0.948564350605011, + "B": 0.00125853659119457, + "C": 0.00043493861448951066, + "D": 0.0020749762188643217 }, "sample": { "messages": [ @@ -103275,17 +103275,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.25610971450805664, - "B": 0.023821910843253136, - "C": 0.6961783170700073, - "D": 0.0009236757177859545 + "A": 0.6346635222434998, + "B": 0.004276329185813665, + "C": 0.2997938394546509, + "D": 0.0003097762819379568 }, "sample": { "messages": [ @@ -103315,7 +103315,7 @@ "prompt_len": 67, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -103327,10 +103327,10 @@ ] }, "predict": { - "A": 0.001282618730328977, - "B": 0.9667151570320129, - "C": 0.0018661993090063334, - "D": 0.001282618730328977 + "A": 0.0006159106851555407, + "B": 0.867266833782196, + "C": 0.0007429301040247083, + "D": 0.0004796717257704586 }, "sample": { "messages": [ @@ -103372,10 +103372,10 @@ ] }, "predict": { - "A": 0.7672638893127441, - "B": 0.13333046436309814, - "C": 0.006638133432716131, - "D": 0.06298085302114487 + "A": 0.7272917032241821, + "B": 0.07665597647428513, + "C": 0.007130117155611515, + "D": 0.07665597647428513 }, "sample": { "messages": [ @@ -103417,10 +103417,10 @@ ] }, "predict": { - "A": 0.9441482424736023, - "B": 0.007208660244941711, - "C": 0.0005221936153247952, - "D": 0.004372274037450552 + "A": 0.8920170664787292, + "B": 0.00029923839611001313, + "C": 9.126254735747352e-05, + "D": 0.0004933606833219528 }, "sample": { "messages": [ @@ -103458,14 +103458,14 @@ "acc": false, "f1_macro": [ "D", - "B" + "A" ] }, "predict": { - "A": 0.14246268570423126, - "B": 0.7234852313995361, - "C": 0.04625086113810539, - "D": 0.04625086113810539 + "A": 0.5965735912322998, + "B": 0.1331135630607605, + "C": 0.1036689355969429, + "D": 0.009642713703215122 }, "sample": { "messages": [ @@ -103495,7 +103495,7 @@ "prompt_len": 76, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " B" + "generated_token": " A" } } { @@ -103507,10 +103507,10 @@ ] }, "predict": { - "A": 0.005682113580405712, - "B": 0.009368222206830978, - "C": 0.001191035844385624, - "D": 0.9555845856666565 + "A": 0.0022994638420641422, + "B": 0.0006588074029423296, + "C": 0.00029234401881694794, + "D": 0.927669882774353 }, "sample": { "messages": [ @@ -103552,10 +103552,10 @@ ] }, "predict": { - "A": 0.028682999312877655, - "B": 0.009312006644904613, - "C": 0.9498504400253296, - "D": 0.0009814782533794641 + "A": 0.020014921203255653, + "B": 0.0014498761156573892, + "C": 0.964372992515564, + "D": 0.00022234569769352674 }, "sample": { "messages": [ @@ -103597,10 +103597,10 @@ ] }, "predict": { - "A": 0.02796175517141819, - "B": 0.02796175517141819, - "C": 0.9259661436080933, - "D": 0.00033066075411625206 + "A": 0.015611814334988594, + "B": 0.005068414378911257, + "C": 0.9658687710762024, + "D": 0.0008807583362795413 }, "sample": { "messages": [ @@ -103642,10 +103642,10 @@ ] }, "predict": { - "A": 0.0006165074883028865, - "B": 0.9836943745613098, - "C": 0.0001213975265272893, - "D": 0.001305146375671029 + "A": 0.00012920747394673526, + "B": 0.9239556193351746, + "C": 3.9406048017553985e-05, + "D": 0.0003979869943577796 }, "sample": { "messages": [ @@ -103687,10 +103687,10 @@ ] }, "predict": { - "A": 0.04053833335638046, - "B": 0.81423419713974, - "C": 0.1101946160197258, - "D": 0.007044506259262562 + "A": 0.38287344574928284, + "B": 0.4916192293167114, + "C": 0.040354564785957336, + "D": 0.01682228408753872 }, "sample": { "messages": [ @@ -103732,10 +103732,10 @@ ] }, "predict": { - "A": 0.039918579161167145, - "B": 0.3342341482639313, - "C": 0.5510589480400085, - "D": 0.01009299699217081 + "A": 0.03248829022049904, + "B": 0.39578837156295776, + "C": 0.44848698377609253, + "D": 0.015346379950642586 }, "sample": { "messages": [ @@ -103777,10 +103777,10 @@ ] }, "predict": { - "A": 0.005564772058278322, - "B": 0.04111841320991516, - "C": 0.9358507990837097, - "D": 0.009174758568406105 + "A": 0.001300509087741375, + "B": 0.0031197592616081238, + "C": 0.9801991581916809, + "D": 0.004539222922176123 }, "sample": { "messages": [ @@ -103822,10 +103822,10 @@ ] }, "predict": { - "A": 0.965977132320404, - "B": 0.007375326007604599, - "C": 0.000534266815520823, - "D": 0.005068982485681772 + "A": 0.8902012705802917, + "B": 0.0011811016593128443, + "C": 0.0002635394048411399, + "D": 0.0015165646327659488 }, "sample": { "messages": [ @@ -103867,10 +103867,10 @@ ] }, "predict": { - "A": 0.974951446056366, - "B": 0.003516223980113864, - "C": 0.0007845756481401622, - "D": 0.001007415005005896 + "A": 0.951693058013916, + "B": 0.00024863792350515723, + "C": 0.0001508065324742347, + "D": 0.00020612809748854488 }, "sample": { "messages": [ @@ -103912,10 +103912,10 @@ ] }, "predict": { - "A": 0.0039991107769310474, - "B": 0.9785500764846802, - "C": 0.0016670774202793837, - "D": 0.001011133543215692 + "A": 0.0022675280924886465, + "B": 0.9147861003875732, + "C": 0.016754893586039543, + "D": 0.0010711044305935502 }, "sample": { "messages": [ @@ -103957,10 +103957,10 @@ ] }, "predict": { - "A": 0.008224092423915863, - "B": 0.0174104031175375, - "C": 0.00440203957259655, - "D": 0.9505757689476013 + "A": 0.004122400656342506, + "B": 0.004388272762298584, + "C": 0.004972564056515694, + "D": 0.9476029872894287 }, "sample": { "messages": [ @@ -104002,10 +104002,10 @@ ] }, "predict": { - "A": 0.8407531380653381, - "B": 0.08861473202705383, - "C": 0.013589508831501007, - "D": 0.02876899018883705 + "A": 0.9081594347953796, + "B": 0.008903282694518566, + "C": 0.008903282694518566, + "D": 0.006119130179286003 }, "sample": { "messages": [ @@ -104047,10 +104047,10 @@ ] }, "predict": { - "A": 0.0004278522974345833, - "B": 0.00020210311049595475, - "C": 0.9932900667190552, - "D": 8.968271868070588e-05 + "A": 0.00045382819371297956, + "B": 6.538014713441953e-05, + "C": 0.9897609353065491, + "D": 4.7833142161834985e-05 }, "sample": { "messages": [ @@ -104085,17 +104085,17 @@ } { "metric": { - "acc": false, + "acc": true, "f1_macro": [ "A", - "C" + "A" ] }, "predict": { - "A": 0.3828880786895752, - "B": 0.03142936900258064, - "C": 0.5570988655090332, - "D": 0.0031119289342314005 + "A": 0.5588390231132507, + "B": 0.007488447241485119, + "C": 0.3840840756893158, + "D": 0.0048349048011004925 }, "sample": { "messages": [ @@ -104125,7 +104125,7 @@ "prompt_len": 86, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " C" + "generated_token": " A" } } { @@ -104137,10 +104137,10 @@ ] }, "predict": { - "A": 0.9593356847763062, - "B": 0.0023779557086527348, - "C": 0.0001520176010672003, - "D": 0.0001520176010672003 + "A": 0.8797038793563843, + "B": 0.0005513338837772608, + "C": 6.185795064084232e-05, + "D": 2.7449299523141235e-05 }, "sample": { "messages": [ @@ -104182,10 +104182,10 @@ ] }, "predict": { - "A": 0.6251398324966431, - "B": 0.01470187958329916, - "C": 0.2952951490879059, - "D": 0.005408518947660923 + "A": 0.4583349823951721, + "B": 0.005769585724920034, + "C": 0.4583349823951721, + "D": 0.0029011298902332783 }, "sample": { "messages": [ @@ -104215,7 +104215,7 @@ "prompt_len": 74, "generated_len": 1, "generated_cumulative_logprob": "TODO: calculate for hf model", - "generated_token": " A" + "generated_token": " C" } } { @@ -104227,10 +104227,10 @@ ] }, "predict": { - "A": 0.21226456761360168, - "B": 0.5769948363304138, - "C": 0.011975145898759365, - "D": 0.14588715136051178 + "A": 0.18008890748023987, + "B": 0.48953235149383545, + "C": 0.0662510022521019, + "D": 0.18008890748023987 }, "sample": { "messages": [ @@ -104272,10 +104272,10 @@ ] }, "predict": { - "A": 2.862832297978457e-05, - "B": 0.9766626358032227, - "C": 3.675949483294971e-05, - "D": 0.00019871995027642697 + "A": 2.4808394300634973e-05, + "B": 0.9009292125701904, + "C": 2.6408395569887944e-05, + "D": 0.000266715360339731 }, "sample": { "messages": [ @@ -104317,10 +104317,10 @@ ] }, "predict": { - "A": 0.1390107125043869, - "B": 0.12267651408910751, - "C": 0.7059546709060669, - "D": 0.0012026784243062139 + "A": 0.029896948486566544, + "B": 0.06329184770584106, + "C": 0.8737169504165649, + "D": 0.0005828977446071804 }, "sample": { "messages": [ @@ -104362,10 +104362,10 @@ ] }, "predict": { - "A": 0.013891485519707203, - "B": 0.004509905818849802, - "C": 0.9738683700561523, - "D": 0.0002883086272049695 + "A": 0.01392698846757412, + "B": 0.0010739340214058757, + "C": 0.9763572812080383, + "D": 0.00017531491175759584 }, "sample": { "messages": [ @@ -104407,10 +104407,10 @@ ] }, "predict": { - "A": 0.9507948160171509, - "B": 0.0072594075463712215, - "C": 0.017414415255188942, - "D": 0.0008670126553624868 + "A": 0.881402850151062, + "B": 0.0012448497582226992, + "C": 0.023488590493798256, + "D": 0.00026093467022292316 }, "sample": { "messages": [ @@ -104452,10 +104452,10 @@ ] }, "predict": { - "A": 0.9648000001907349, - "B": 0.007366338279098272, - "C": 0.0002856239734683186, - "D": 0.00022244415595196187 + "A": 0.9137453436851501, + "B": 0.0008332278812304139, + "C": 6.035882324795239e-05, + "D": 4.14839742006734e-05 }, "sample": { "messages": [ @@ -104497,10 +104497,10 @@ ] }, "predict": { - "A": 0.007142215501517057, - "B": 0.01177552342414856, - "C": 0.009170787408947945, - "D": 0.9354456663131714 + "A": 0.00400136224925518, + "B": 0.008470883592963219, + "C": 0.00747552840039134, + "D": 0.8640536069869995 }, "sample": { "messages": [ @@ -104542,10 +104542,10 @@ ] }, "predict": { - "A": 0.9575772285461426, - "B": 0.0030477584805339575, - "C": 0.00013390916865319014, - "D": 0.0003640028298832476 + "A": 0.9014615416526794, + "B": 0.0008220265153795481, + "C": 0.0003647720441222191, + "D": 0.0007722224108874798 }, "sample": { "messages": [ @@ -104587,10 +104587,10 @@ ] }, "predict": { - "A": 0.0520244725048542, - "B": 0.06680074334144592, - "C": 0.14141717553138733, - "D": 0.7181757092475891 + "A": 0.01533483061939478, + "B": 0.013532941229641438, + "C": 0.1868164837360382, + "D": 0.7388735413551331 }, "sample": { "messages": [ @@ -104632,10 +104632,10 @@ ] }, "predict": { - "A": 0.03883950412273407, - "B": 0.026693973690271378, - "C": 0.006340374238789082, - "D": 0.8839830160140991 + "A": 0.01680178940296173, + "B": 0.0014681239845231175, + "C": 0.0012956148711964488, + "D": 0.9173465967178345 }, "sample": { "messages": [ @@ -104677,10 +104677,10 @@ ] }, "predict": { - "A": 0.0024321742821484804, - "B": 0.00037298601819202304, - "C": 0.0010138810612261295, - "D": 0.9812090992927551 + "A": 0.0016250427579507232, + "B": 8.612410601926968e-05, + "C": 0.0018414146034047008, + "D": 0.9538763761520386 }, "sample": { "messages": [ @@ -104722,10 +104722,10 @@ ] }, "predict": { - "A": 0.0206123236566782, - "B": 0.3224309980869293, - "C": 0.6023803949356079, - "D": 0.029990753158926964 + "A": 0.010085893794894218, + "B": 0.051220398396253586, + "C": 0.9079031944274902, + "D": 0.008900769986212254 }, "sample": { "messages": [