如果发现自己被卡常了,很简单,只要直接把优化拉满就行了。
然后在数据没有拉满的时候,完全没必要跑 $31$ 次。直接判断最大值对应的位数即可。
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-fhoist-adjacent-loads")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
快读的基础上,也可以把getchar
函数换成Linux
系统中的getchar_unlocked
函数。
然后按照上面的跑法就会快很多。
注意:不要随便用__int128
类型,这样的话性能会慢的特别多。
其中9s的是唯一的题解的做法,再往上3s和6s是题解代码和我的代码优化之后的性能。
本人的代码如下:
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-fhoist-adjacent-loads")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#define getchar getchar_unlocked
#define putchar putchar_unlocked
#define maxn 1020
#define max(a, b) ((a) > (b) ? (a) : (b))
typedef long long ll;
const ll mod = 1000000007;
ll rd() {
ll k = 0;
char c = getchar();
while (!isdigit(c))
c = getchar();
while (isdigit(c)) {
k = (k << 1) + (k << 3) + (c ^ 48);
c = getchar();
}
return k;
}
void wr(ll x) {
if (x > 9)wr(x / 10);
putchar(x % 10 + '0');
}
ll n, m;
ll d[maxn][maxn];
ll l[maxn], r[maxn], h[maxn], s[maxn];
ll max_value;
int top;
ll AmulBmodP(ll a, ll b, ll p) {
ll ret = 0;
while (b) {
if (b & 1)ret = (ret + a) % p;
a = (a << 1) % p;
b >>= 1;
}
return ret % p;
}
ll solve(int bit, int flag) {
memset(h, 0, sizeof(h));
ll ret = 0;
ll i = 0, j = 0;
for (i = 1; i <= n; ++i) {
for (j = 1; j <= m; ++j) {
h[j]++;
if ((d[i][j] & (1 << bit)) >> bit != flag) h[j] = 0;
}
top = 0;
for (j = m; j >= 1; --j) {
while (top != 0 && h[j] <= h[s[top]]) l[s[top--]] = j;
s[++top] = j;
}
while (top) l[s[top--]] = 0;
top = 0;
for (j = 1; j <= m; ++j) {
while (top != 0 && h[j] < h[s[top]]) r[s[top--]] = j;
s[++top] = j;
}
while (top) r[s[top--]] = (ll)m + 1;
for (j = 1; j <= m; j++)
ret += (j - l[j]) * (r[j] - j) * h[j];
}
return ret;
}
ll ans_and, ans_or;
ll max_bit;
int main() {
n = rd();
m = n;
for (int i = 1; i <= n; ++i)
for (int j = 1; j <= m; ++j)
d[i][j] = rd(), max_value = max(max_value, d[i][j]);
max_bit = 1;
while ((1ll << max_bit) <= max_value)max_bit++;
ll tot_reg = (n * (n + 1) * m * (m + 1)) >> 2;
for (ll i = 0; i < max_bit; ++i) {
ans_and += AmulBmodP(solve(i, 1), 1ll << i, mod), ans_and %= mod;
ans_or += AmulBmodP(tot_reg - solve(i, 0), 1ll << i, mod), ans_or %= mod;
}
wr(ans_and), putchar(' '), wr(ans_or);
}
🐂
太牛了,不知道与火车头相比如何?